Diffstat (limited to 'drivers/block'): 64 files changed, 3915 insertions, 2404 deletions
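Two block-layer API patterns recur throughout the hunks that follow: queue limits are now declared up front in a struct queue_limits and passed to the disk allocator instead of being applied afterwards with blk_queue_*() helpers, and blk_mq_freeze_queue() now returns memalloc flags that must be handed back to blk_mq_unfreeze_queue(). The sketch below is illustrative only and is not part of the patch; "mydrv", the tag set, and the specific limit values are placeholders, with the calls mirroring the converted drivers (amiflop, ataflop, floppy, aoe).

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

/* Limits travel with the allocation; failure now comes back as an ERR_PTR. */
static int mydrv_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
{
	struct queue_limits lim = {
		.max_hw_sectors	= 64,			/* example per-driver cap */
		.features	= BLK_FEAT_ROTATIONAL,	/* spinning media */
	};
	struct gendisk *disk;

	disk = blk_mq_alloc_disk(set, &lim, queuedata);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	/* ... set disk->major, disk->disk_name, etc. ... */
	return add_disk(disk);
}

/* Freezing a queue now returns memalloc flags to restore on unfreeze. */
static void mydrv_quiesced_update(struct request_queue *q)
{
	unsigned int memflags = blk_mq_freeze_queue(q);

	blk_mq_quiesce_queue(q);
	/* ... change settings while no I/O is in flight ... */
	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q, memflags);
}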
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 5b9d4aaebb81..a97f2c40c640 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -130,7 +130,7 @@ config BLK_DEV_UBD_SYNC kernel command line option. Alternatively, you can say Y here to turn on synchronous operation by default for all block devices. - If you're running a journalling file system (like reiserfs, for + If you're running a journalling file system (like xfs, for example) in your virtual machine, you will want to say Y here. If you care for the safety of the data in your virtual machine, Y is a wise choice too. In all other cases (for example, if you're just @@ -354,6 +354,15 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. +config BLK_DEV_RUST_NULL + tristate "Rust null block driver (Experimental)" + depends on RUST + help + This is the Rust implementation of the null block driver. For now it + is only a minimal stub. + + If unsure, say N. + config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 101612cba303..1105a2d4fdcb 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,6 +9,9 @@ # needed for trace events ccflags-y += -I$(src) +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o +rnull_mod-y := rnull.o + obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 2b98114a9fe0..9edd4468f755 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -232,6 +232,7 @@ static DEFINE_MUTEX(amiflop_mutex); static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ module_param(fd_def_df0, ulong, 0); +MODULE_DESCRIPTION("Amiga floppy driver"); MODULE_LICENSE("GPL"); /* @@ -1776,10 +1777,13 @@ static const struct blk_mq_ops amiflop_mq_ops = { static int fd_alloc_disk(int drive, int system) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); @@ -1815,7 +1819,6 @@ static int fd_alloc_drive(int drive) unit[drive].tag_set.nr_maps = 1; unit[drive].tag_set.queue_depth = 2; unit[drive].tag_set.numa_node = NUMA_NO_NODE; - unit[drive].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; if (blk_mq_alloc_tag_set(&unit[drive].tag_set)) goto out_cleanup_trackbuf; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b1b47d88f5db..00b74a845328 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -24,8 +24,8 @@ static DEFINE_MUTEX(aoeblk_mutex); static struct kmem_cache *buf_pool_cache; static struct dentry *aoe_debugfs_dir; -/* GPFS needs a larger value than the default. 
*/ -static int aoe_maxsectors; +/* random default picked from the historic block max_sectors cap */ +static int aoe_maxsectors = 2560; module_param(aoe_maxsectors, int, 0644); MODULE_PARM_DESC(aoe_maxsectors, "When nonzero, set the maximum number of sectors per I/O request"); @@ -334,6 +334,11 @@ aoeblk_gdalloc(void *vp) mempool_t *mp; struct blk_mq_tag_set *set; sector_t ssize; + struct queue_limits lim = { + .max_hw_sectors = aoe_maxsectors, + .io_opt = SZ_2M, + .features = BLK_FEAT_ROTATIONAL, + }; ulong flags; int late = 0; int err; @@ -363,7 +368,6 @@ aoeblk_gdalloc(void *vp) set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; - set->flags = BLK_MQ_F_SHOULD_MERGE; err = blk_mq_alloc_tag_set(set); if (err) { pr_err("aoe: cannot allocate tag set for %ld.%d\n", @@ -371,7 +375,7 @@ aoeblk_gdalloc(void *vp) goto err_mempool; } - gd = blk_mq_alloc_disk(set, d); + gd = blk_mq_alloc_disk(set, &lim, d); if (IS_ERR(gd)) { pr_err("aoe: cannot allocate block queue for %ld.%d\n", d->aoemajor, d->aoeminor); @@ -384,14 +388,9 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - /* random number picked from the history block max_sectors cap */ - blk_queue_max_hw_sectors(gd->queue, 2560u); - blk_queue_io_opt(gd->queue, SZ_2M); d->bufpool = mp; d->blkq = gd->queue; d->gd = gd; - if (aoe_maxsectors) - blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors); gd->major = AOE_MAJOR; gd->first_minor = d->sysminor; gd->minors = AOE_PARTITIONS; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d7317425be51..92b06d1de4cc 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -14,7 +14,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <net/net_namespace.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/uio.h> #include "aoe.h" @@ -361,6 +361,7 @@ ata_rw_frameinit(struct frame *f) } ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; + dev_hold(t->ifp->nd); skb->dev = t->ifp->nd; } @@ -401,6 +402,8 @@ aoecmd_ata_rw(struct aoedev *d) __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); + } else { + dev_put(f->t->ifp->nd); } return 1; } @@ -419,13 +422,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu rcu_read_lock(); for_each_netdev_rcu(&init_net, ifp) { dev_hold(ifp); - if (!is_aoe_netif(ifp)) - goto cont; + if (!is_aoe_netif(ifp)) { + dev_put(ifp); + continue; + } skb = new_skb(sizeof *h + sizeof *ch); if (skb == NULL) { printk(KERN_INFO "aoe: skb alloc failure\n"); - goto cont; + dev_put(ifp); + continue; } skb_put(skb, sizeof *h + sizeof *ch); skb->dev = ifp; @@ -440,9 +446,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu h->major = cpu_to_be16(aoemajor); h->minor = aoeminor; h->cmd = AOECMD_CFG; - -cont: - dev_put(ifp); } rcu_read_unlock(); } @@ -483,10 +486,13 @@ resend(struct aoedev *d, struct frame *f) memcpy(h->dst, t->addr, sizeof h->dst); memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); + dev_hold(t->ifp->nd); skb->dev = t->ifp->nd; skb = skb_clone(skb, GFP_ATOMIC); - if (skb == NULL) + if (skb == NULL) { + dev_put(t->ifp->nd); return; + } f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); @@ -617,6 +623,8 @@ probe(struct aoetgt *t) __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); + } else { + dev_put(f->t->ifp->nd); } } @@ -1395,6 +1403,7 @@ aoecmd_ata_id(struct 
aoedev *d) ah->cmdstat = ATA_CMD_ID_ATA; ah->lba3 = 0xa0; + dev_hold(t->ifp->nd); skb->dev = t->ifp->nd; d->rttavg = RTTAVG_INIT; @@ -1404,6 +1413,8 @@ aoecmd_ata_id(struct aoedev *d) skb = skb_clone(skb, GFP_ATOMIC); if (skb) f->sent = ktime_get(); + else + dev_put(t->ifp->nd); return skb; } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 3523dd82d7a0..4db7f6ce8ade 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -226,10 +226,11 @@ aoedev_downdev(struct aoedev *d) /* fast fail all pending I/O */ if (d->blkq) { /* UP is cleared, freeze+quiesce to insure all are errored */ - blk_mq_freeze_queue(d->blkq); + unsigned int memflags = blk_mq_freeze_queue(d->blkq); + blk_mq_quiesce_queue(d->blkq); blk_mq_unquiesce_queue(d->blkq); - blk_mq_unfreeze_queue(d->blkq); + blk_mq_unfreeze_queue(d->blkq, memflags); } if (d->gd) diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index c51ea95bc2ce..66e617664c14 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -10,7 +10,7 @@ #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "aoe.h" #define NECODES 5 @@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); + dev_put(ifp); spin_lock_irq(&txlock); } return 0; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 50949207798d..a81ade622a01 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -746,6 +746,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) unsigned char *p; int sect, nsect; unsigned long flags; + unsigned int memflags; int ret; if (type) { @@ -758,7 +759,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) } q = unit[drive].disk[type]->queue; - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); local_irq_save(flags); @@ -817,7 +818,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) ret = FormatError ? -EIO : 0; out: blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -1992,9 +1993,12 @@ static const struct blk_mq_ops ataflop_mq_ops = { static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); @@ -2085,7 +2089,6 @@ static int __init atari_floppy_init (void) unit[i].tag_set.nr_maps = 1; unit[i].tag_set.queue_depth = 2; unit[i].tag_set.numa_node = NUMA_NO_NODE; - unit[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ret = blk_mq_alloc_tag_set(&unit[i].tag_set); if (ret) goto err; @@ -2197,4 +2200,5 @@ static void __exit atari_floppy_exit(void) module_init(atari_floppy_init) module_exit(atari_floppy_exit) +MODULE_DESCRIPTION("Atari floppy driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 970bd6ff38c4..292f127cae0a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -29,10 +29,7 @@ /* * Each block ramdisk device has a xarray brd_pages of pages that stores - * the pages containing the block device's contents. A brd page's ->index is - * its offset in PAGE_SIZE units. 
This is similar to, but in no way connected - * with, the kernel's pagecache or buffer cache (which sit above our block - * device). + * the pages containing the block device's contents. */ struct brd_device { int brd_number; @@ -51,15 +48,7 @@ struct brd_device { */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - pgoff_t idx; - struct page *page; - - idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ - page = xa_load(&brd->brd_pages, idx); - - BUG_ON(page && page->index != idx); - - return page; + return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); } /* @@ -67,8 +56,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) */ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { - pgoff_t idx; - struct page *page, *cur; + pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; + struct page *page; int ret = 0; page = brd_lookup_page(brd, sector); @@ -80,23 +69,16 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) return -ENOMEM; xa_lock(&brd->brd_pages); - - idx = sector >> PAGE_SECTORS_SHIFT; - page->index = idx; - - cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); - - if (unlikely(cur)) { - __free_page(page); - ret = xa_err(cur); - if (!ret && (cur->index != idx)) - ret = -EIO; - } else { + ret = __xa_insert(&brd->brd_pages, idx, page, gfp); + if (!ret) brd->brd_nr_pages++; - } - xa_unlock(&brd->brd_pages); + if (ret < 0) { + __free_page(page); + if (ret == -EBUSY) + ret = 0; + } return ret; } @@ -240,6 +222,25 @@ out: return err; } +static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) +{ + sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS; + struct page *page; + + size -= (aligned_sector - sector) * SECTOR_SIZE; + xa_lock(&brd->brd_pages); + while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { + page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); + if (page) { + __free_page(page); + brd->brd_nr_pages--; + } + aligned_sector += PAGE_SECTORS; + size -= PAGE_SIZE; + } + xa_unlock(&brd->brd_pages); +} + static void brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; @@ -247,6 +248,12 @@ static void brd_submit_bio(struct bio *bio) struct bio_vec bvec; struct bvec_iter iter; + if (unlikely(op_is_discard(bio->bi_opf))) { + brd_do_discard(brd, sector, bio->bi_iter.bi_size); + bio_endio(bio); + return; + } + bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; int err; @@ -291,6 +298,7 @@ static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); +MODULE_DESCRIPTION("Ram backed block device driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -310,23 +318,65 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). 
*/ static LIST_HEAD(brd_devices); +static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; +static struct brd_device *brd_find_or_alloc_device(int i) +{ + struct brd_device *brd; + + mutex_lock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) { + mutex_unlock(&brd_devices_mutex); + return ERR_PTR(-EEXIST); + } + } + + brd = kzalloc(sizeof(*brd), GFP_KERNEL); + if (!brd) { + mutex_unlock(&brd_devices_mutex); + return ERR_PTR(-ENOMEM); + } + brd->brd_number = i; + list_add_tail(&brd->brd_list, &brd_devices); + mutex_unlock(&brd_devices_mutex); + return brd; +} + +static void brd_free_device(struct brd_device *brd) +{ + mutex_lock(&brd_devices_mutex); + list_del(&brd->brd_list); + mutex_unlock(&brd_devices_mutex); + kfree(brd); +} + static int brd_alloc(int i) { struct brd_device *brd; struct gendisk *disk; char buf[DISK_NAME_LEN]; int err = -ENOMEM; - - list_for_each_entry(brd, &brd_devices, brd_list) - if (brd->brd_number == i) - return -EEXIST; - brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) - return -ENOMEM; - brd->brd_number = i; - list_add_tail(&brd->brd_list, &brd_devices); + struct queue_limits lim = { + /* + * This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + .physical_block_size = PAGE_SIZE, + .max_hw_discard_sectors = UINT_MAX, + .max_discard_segments = 1, + .discard_granularity = PAGE_SIZE, + .features = BLK_FEAT_SYNCHRONOUS | + BLK_FEAT_NOWAIT, + }; + + brd = brd_find_or_alloc_device(i); + if (IS_ERR(brd)) + return PTR_ERR(brd); xa_init(&brd->brd_pages); @@ -335,10 +385,11 @@ static int brd_alloc(int i) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_free_dev; - + } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; @@ -347,19 +398,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - /* - * This is so fdisk will align partitions on 4k, because of - * direct_access API needing 4k alignment, returning a PFN - * (This is only a problem on very small devices <= 4M, - * otherwise fdisk will align on 1M. 
Regardless this call - * is harmless) - */ - blk_queue_physical_block_size(disk->queue, PAGE_SIZE); - - /* Tell the block layer that this is not a rotational device */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) goto out_cleanup_disk; @@ -369,8 +407,7 @@ static int brd_alloc(int i) out_cleanup_disk: put_disk(disk); out_free_dev: - list_del(&brd->brd_list); - kfree(brd); + brd_free_device(brd); return err; } @@ -389,8 +426,7 @@ static void brd_cleanup(void) del_gendisk(brd->brd_disk); put_disk(brd->brd_disk); brd_free_pages(brd); - list_del(&brd->brd_list); - kfree(brd); + brd_free_device(brd); } } @@ -417,16 +453,6 @@ static int __init brd_init(void) { int err, i; - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; - } - /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. @@ -442,11 +468,18 @@ static int __init brd_init(void) * dynamically. */ + brd_check_and_reset_par(); + + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { err = -EIO; goto out_free; } + for (i = 0; i < rd_nr; i++) + brd_alloc(i); + pr_info("brd: module loaded\n"); return 0; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c21e3732759e..e21492981f7d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -297,10 +297,6 @@ struct drbd_epoch { unsigned long flags; }; -/* Prototype declaration of function defined in drbd_receiver.c */ -int drbdd_init(struct drbd_thread *); -int drbd_asender(struct drbd_thread *); - /* drbd_epoch flag bits */ enum { DE_HAVE_BARRIER_NUMBER, @@ -524,9 +520,9 @@ struct drbd_md { struct drbd_backing_dev { struct block_device *backing_bdev; - struct bdev_handle *backing_bdev_handle; + struct file *backing_bdev_file; struct block_device *md_bdev; - struct bdev_handle *md_bdev_handle; + struct file *f_md_bdev; struct drbd_md md; struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */ sector_t known_size; /* last known size of that backing device */ @@ -864,7 +860,6 @@ struct drbd_device { struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ struct list_head net_ee; /* zero-copy network send in progress */ - int next_barrier_nr; struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ @@ -1369,7 +1364,6 @@ extern struct bio_set drbd_io_bio_set; extern struct mutex resources_mutex; -extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern void drbd_destroy_device(struct kref *kref); extern void drbd_delete_device(struct drbd_device *device); @@ -1390,9 +1384,6 @@ extern void conn_free_crypto(struct drbd_connection *connection); extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *); void drbd_submit_bio(struct bio *bio); -extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); -extern int is_valid_ar_handle(struct drbd_request *, sector_t); - /* drbd_nl.c */ @@ -1474,7 +1465,6 @@ extern int 
w_resync_timer(struct drbd_work *, int); extern int w_send_write_hint(struct drbd_work *, int); extern int w_send_dblock(struct drbd_work *, int); extern int w_send_read_req(struct drbd_work *, int); -extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); @@ -1488,7 +1478,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); -extern void drbd_send_ping_wf(struct work_struct *ws); extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, @@ -1504,7 +1493,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request #define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); -extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6bc86106c7b2..5bbd312c3e14 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -471,20 +471,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) wait_for_completion(&thi->stop); } -int conn_lowest_minor(struct drbd_connection *connection) -{ - struct drbd_peer_device *peer_device; - int vnr = 0, minor = -1; - - rcu_read_lock(); - peer_device = idr_get_next(&connection->peer_devices, &vnr); - if (peer_device) - minor = device_to_minor(peer_device->device); - rcu_read_unlock(); - - return minor; -} - #ifdef CONFIG_SMP /* * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs @@ -1550,7 +1536,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa * put_page(); and would cause either a VM_BUG directly, or * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. */ - if (!drbd_disable_sendpage && sendpage_ok(page)) + if (!drbd_disable_sendpage && sendpages_ok(page, len, offset)) msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES; drbd_update_congested(peer_device->connection); @@ -2690,6 +2676,17 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig int id; int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; + struct queue_limits lim = { + /* + * Setting the max_hw_sectors to an odd value of 8kibyte here. + * This triggers a max_bio_size message upon first attach or + * connect. 
+ */ + .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_ROTATIONAL | + BLK_FEAT_STABLE_WRITES, + }; device = minor_to_device(minor); if (device) @@ -2708,9 +2705,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_no_disk; + } device->vdisk = disk; device->rq_queue = disk->queue; @@ -2725,12 +2724,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - blk_queue_write_cache(disk->queue, true, true); - /* Setting the max_hw_sectors to an odd value of 8kibyte here - This triggers a max_bio_size message upon first attach or connect */ - blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8); - device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) goto out_no_io_page; @@ -3392,10 +3385,12 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) { unsigned long flags; - if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) { + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); return; + } - spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); if (val == 0) { drbd_uuid_move_history(device); device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; @@ -3415,6 +3410,7 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) /** * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3441,6 +3437,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device, /** * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3494,6 +3491,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * @done: callback to be called after the bitmap IO was performed * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * While IO on the bitmap happens we freeze application IO thus we ensure * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be @@ -3542,6 +3540,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * @io_fn: IO callback to be called when bitmap IO is possible * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 43747a1aae43..720fc30e2ecc 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -25,7 +25,7 @@ #include "drbd_protocol.h" #include "drbd_req.h" #include "drbd_state_change.h" -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } -static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) +static unsigned int drbd_max_peer_bio_size(struct drbd_device *device) { - q->limits.discard_granularity = granularity; + /* + * We may ignore peer limits if the peer is modern enough. From 8.3.8 + * onwards the peer can use multiple BIOs for a single peer_request. + */ + if (device->state.conn < C_WF_REPORT_PARAMS) + return device->peer_max_bio_size; + + if (first_peer_device(device)->connection->agreed_pro_version < 94) + return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + + /* + * Correct old drbd (up to 8.3.7) if it believes it can do more than + * 32KiB. + */ + if (first_peer_device(device)->connection->agreed_pro_version == 94) + return DRBD_MAX_SIZE_H80_PACKET; + + /* + * drbd 8.3.8 onwards, before 8.4.0 + */ + if (first_peer_device(device)->connection->agreed_pro_version < 100) + return DRBD_MAX_BIO_SIZE_P95; + return DRBD_MAX_BIO_SIZE; } static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) @@ -1204,149 +1226,119 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) return AL_EXTENT_SIZE >> 9; } -static void decide_on_discard_support(struct drbd_device *device, +static bool drbd_discard_supported(struct drbd_connection *connection, struct drbd_backing_dev *bdev) { - struct drbd_connection *connection = - first_peer_device(device)->connection; - struct request_queue *q = device->rq_queue; - unsigned int max_discard_sectors; - if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) - goto not_supported; + return false; if (connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); - goto not_supported; + return false; } - /* - * We don't care for the granularity, really. - * - * Stacking limits below should fix it for the local device. Whether or - * not it is a suitable granularity on the remote device is not our - * problem, really. If you care, you need to use devices with similar - * topology on all peers. - */ - blk_queue_discard_granularity(q, 512); - max_discard_sectors = drbd_max_discard_sectors(connection); - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - return; - -not_supported: - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); + return true; } -static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) +/* This is the workaround for "bio would need to, but cannot, be split" */ +static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device) { - /* Fixup max_write_zeroes_sectors after blk_stack_limits(): - * if we can handle "zeroes" efficiently on the protocol, - * we want to do that, even if our backend does not announce - * max_write_zeroes_sectors itself. 
*/ - struct drbd_connection *connection = first_peer_device(device)->connection; - /* If the peer announces WZEROES support, use it. Otherwise, rather - * send explicit zeroes than rely on some discard-zeroes-data magic. */ - if (connection->agreed_features & DRBD_FF_WZEROES) - q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; - else - q->limits.max_write_zeroes_sectors = 0; -} + unsigned int max_segments; -static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) -{ - unsigned int max_discard = device->rq_queue->limits.max_discard_sectors; - unsigned int discard_granularity = - device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT; + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); - if (discard_granularity > max_discard) { - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); - } + if (!max_segments) + return BLK_MAX_SEGMENTS; + return max_segments; } -static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, - unsigned int max_bio_size, struct o_qlim *o) +void drbd_reconsider_queue_parameters(struct drbd_device *device, + struct drbd_backing_dev *bdev, struct o_qlim *o) { + struct drbd_connection *connection = + first_peer_device(device)->connection; struct request_queue * const q = device->rq_queue; - unsigned int max_hw_sectors = max_bio_size >> 9; - unsigned int max_segments = 0; + unsigned int now = queue_max_hw_sectors(q) << 9; + struct queue_limits lim; struct request_queue *b = NULL; - struct disk_conf *dc; + unsigned int new; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; - max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); - rcu_read_lock(); - dc = rcu_dereference(device->ldev->disk_conf); - max_segments = dc->max_bio_bvecs; - rcu_read_unlock(); - - blk_set_stacking_limits(&q->limits); + device->local_max_bio_size = + queue_max_hw_sectors(b) << SECTOR_SHIFT; } - blk_queue_max_hw_sectors(q, max_hw_sectors); - /* This is the workaround for "bio would need to, but cannot, be split" */ - blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); - blk_queue_segment_boundary(q, PAGE_SIZE-1); - decide_on_discard_support(device, bdev); - - if (b) { - blk_stack_limits(&q->limits, &b->limits, 0); - disk_update_readahead(device->vdisk); + /* + * We may later detach and re-attach on a disconnected Primary. Avoid + * decreasing the value in this case. + * + * We want to store what we know the peer DRBD can handle, not what the + * peer IO backend can handle. 
+ */ + new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size, + max(drbd_max_peer_bio_size(device), device->peer_max_bio_size)); + if (new != now) { + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", + new, now); + drbd_info(device, "max BIO size = %u\n", new); } - fixup_write_zeroes(device, q); - fixup_discard_support(device, q); -} - -void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) -{ - unsigned int now, new, local, peer; - - now = queue_max_hw_sectors(device->rq_queue) << 9; - local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ - peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + lim = queue_limits_start_update(q); if (bdev) { - local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; - device->local_max_bio_size = local; + blk_set_stacking_limits(&lim); + lim.max_segments = drbd_backing_dev_max_segments(device); + } else { + lim.max_segments = BLK_MAX_SEGMENTS; } - local = min(local, DRBD_MAX_BIO_SIZE); - /* We may ignore peer limits if the peer is modern enough. - Because new from 8.3.8 onwards the peer can use multiple - BIOs for a single peer_request */ - if (device->state.conn >= C_WF_REPORT_PARAMS) { - if (first_peer_device(device)->connection->agreed_pro_version < 94) - peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - else if (first_peer_device(device)->connection->agreed_pro_version == 94) - peer = DRBD_MAX_SIZE_H80_PACKET; - else if (first_peer_device(device)->connection->agreed_pro_version < 100) - peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ - else - peer = DRBD_MAX_BIO_SIZE; + lim.max_hw_sectors = new >> SECTOR_SHIFT; + lim.seg_boundary_mask = PAGE_SIZE - 1; - /* We may later detach and re-attach on a disconnected Primary. - * Avoid this setting to jump back in that case. - * We want to store what we know the peer DRBD can handle, - * not what the peer IO backend can handle. */ - if (peer > device->peer_max_bio_size) - device->peer_max_bio_size = peer; + /* + * We don't care for the granularity, really. + * + * Stacking limits below should fix it for the local device. Whether or + * not it is a suitable granularity on the remote device is not our + * problem, really. If you care, you need to use devices with similar + * topology on all peers. + */ + if (drbd_discard_supported(connection, bdev)) { + lim.discard_granularity = 512; + lim.max_hw_discard_sectors = + drbd_max_discard_sectors(connection); + } else { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; } - new = min(local, peer); - if (device->state.role == R_PRIMARY && new < now) - drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + if (bdev) + blk_stack_limits(&lim, &b->limits, 0); - if (new != now) - drbd_info(device, "max BIO size = %u\n", new); + /* + * If we can handle "zeroes" efficiently on the protocol, we want to do + * that, even if our backend does not announce max_write_zeroes_sectors + * itself. 
+ */ + if (connection->agreed_features & DRBD_FF_WZEROES) + lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; + else + lim.max_write_zeroes_sectors = 0; + + if ((lim.discard_granularity >> SECTOR_SHIFT) > + lim.max_hw_discard_sectors) { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; + } - drbd_setup_queue_param(device, bdev, new, o); + if (queue_limits_commit_update(q, &lim)) + drbd_err(device, "setting new queue limits failed\n"); } /* Starts the worker thread */ @@ -1635,45 +1627,45 @@ success: return 0; } -static struct bdev_handle *open_backing_dev(struct drbd_device *device, +static struct file *open_backing_dev(struct drbd_device *device, const char *bdev_path, void *claim_ptr, bool do_bd_link) { - struct bdev_handle *handle; + struct file *file; int err = 0; - handle = bdev_open_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE, - claim_ptr, NULL); - if (IS_ERR(handle)) { + file = bdev_file_open_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE, + claim_ptr, NULL); + if (IS_ERR(file)) { drbd_err(device, "open(\"%s\") failed with %ld\n", - bdev_path, PTR_ERR(handle)); - return handle; + bdev_path, PTR_ERR(file)); + return file; } if (!do_bd_link) - return handle; + return file; - err = bd_link_disk_holder(handle->bdev, device->vdisk); + err = bd_link_disk_holder(file_bdev(file), device->vdisk); if (err) { - bdev_release(handle); + fput(file); drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", bdev_path, err); - handle = ERR_PTR(err); + file = ERR_PTR(err); } - return handle; + return file; } static int open_backing_devices(struct drbd_device *device, struct disk_conf *new_disk_conf, struct drbd_backing_dev *nbc) { - struct bdev_handle *handle; + struct file *file; - handle = open_backing_dev(device, new_disk_conf->backing_dev, device, + file = open_backing_dev(device, new_disk_conf->backing_dev, device, true); - if (IS_ERR(handle)) + if (IS_ERR(file)) return ERR_OPEN_DISK; - nbc->backing_bdev = handle->bdev; - nbc->backing_bdev_handle = handle; + nbc->backing_bdev = file_bdev(file); + nbc->backing_bdev_file = file; /* * meta_dev_idx >= 0: external fixed size, possibly multiple @@ -1683,7 +1675,7 @@ static int open_backing_devices(struct drbd_device *device, * should check it for you already; but if you don't, or * someone fooled it, we need to double check here) */ - handle = open_backing_dev(device, new_disk_conf->meta_dev, + file = open_backing_dev(device, new_disk_conf->meta_dev, /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, * if potentially shared with other drbd minors */ (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, @@ -1691,21 +1683,21 @@ static int open_backing_devices(struct drbd_device *device, * as would happen with internal metadata. 
*/ (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); - if (IS_ERR(handle)) + if (IS_ERR(file)) return ERR_OPEN_MD_DISK; - nbc->md_bdev = handle->bdev; - nbc->md_bdev_handle = handle; + nbc->md_bdev = file_bdev(file); + nbc->f_md_bdev = file; return NO_ERROR; } static void close_backing_dev(struct drbd_device *device, - struct bdev_handle *handle, bool do_bd_unlink) + struct file *bdev_file, bool do_bd_unlink) { - if (!handle) + if (!bdev_file) return; if (do_bd_unlink) - bd_unlink_disk_holder(handle->bdev, device->vdisk); - bdev_release(handle); + bd_unlink_disk_holder(file_bdev(bdev_file), device->vdisk); + fput(bdev_file); } void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) @@ -1713,9 +1705,9 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev * if (ldev == NULL) return; - close_backing_dev(device, ldev->md_bdev_handle, + close_backing_dev(device, ldev->f_md_bdev, ldev->md_bdev != ldev->backing_bdev); - close_backing_dev(device, ldev->backing_bdev_handle, true); + close_backing_dev(device, ldev->backing_bdev_file, true); kfree(ldev->disk_conf); kfree(ldev); @@ -2131,9 +2123,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - close_backing_dev(device, nbc->md_bdev_handle, + close_backing_dev(device, nbc->f_md_bdev, nbc->md_bdev != nbc->backing_bdev); - close_backing_dev(device, nbc->backing_bdev_handle, true); + close_backing_dev(device, nbc->backing_bdev_file, true); kfree(nbc); } kfree(new_disk_conf); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 287a8d1d3f70..c2b6c4d9729d 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) ns.disk == D_OUTDATED) rv = SS_CONNECTED_OUTDATES; - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && (nc->verify_alg[0] == 0)) rv = SS_NO_VERIFY_ALG; @@ -1542,9 +1542,10 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, int notify_resource_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_resource_state_change *resource_state_change, + void *state_change, enum drbd_notification_type type) { + struct drbd_resource_state_change *resource_state_change = state_change; struct drbd_resource *resource = resource_state_change->resource; struct resource_info resource_info = { .res_role = resource_state_change->role[NEW], @@ -1558,13 +1559,14 @@ int notify_resource_state_change(struct sk_buff *skb, int notify_connection_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_connection_state_change *connection_state_change, + void *state_change, enum drbd_notification_type type) { - struct drbd_connection *connection = connection_state_change->connection; + struct drbd_connection_state_change *p = state_change; + struct drbd_connection *connection = p->connection; struct connection_info connection_info = { - .conn_connection_state = connection_state_change->cstate[NEW], - .conn_role = connection_state_change->peer_role[NEW], + .conn_connection_state = p->cstate[NEW], + .conn_role = p->peer_role[NEW], }; return notify_connection_state(skb, seq, connection, &connection_info, type); @@ -1572,9 +1574,10 @@ int notify_connection_state_change(struct sk_buff *skb, int notify_device_state_change(struct sk_buff *skb, 
unsigned int seq, - struct drbd_device_state_change *device_state_change, + void *state_change, enum drbd_notification_type type) { + struct drbd_device_state_change *device_state_change = state_change; struct drbd_device *device = device_state_change->device; struct device_info device_info = { .dev_disk_state = device_state_change->disk_state[NEW], @@ -1585,9 +1588,10 @@ int notify_device_state_change(struct sk_buff *skb, int notify_peer_device_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_peer_device_state_change *p, + void *state_change, enum drbd_notification_type type) { + struct drbd_peer_device_state_change *p = state_change; struct drbd_peer_device *peer_device = p->peer_device; struct peer_device_info peer_device_info = { .peer_repl_state = p->repl_state[NEW], @@ -1605,8 +1609,8 @@ static void broadcast_state_change(struct drbd_state_change *state_change) struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; bool resource_state_has_changed; unsigned int n_device, n_connection, n_peer_device, n_peer_devices; - int (*last_func)(struct sk_buff *, unsigned int, void *, - enum drbd_notification_type) = NULL; + int (*last_func)(struct sk_buff *, unsigned int, + void *, enum drbd_notification_type) = NULL; void *last_arg = NULL; #define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) @@ -1616,7 +1620,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change) }) #define REMEMBER_STATE_CHANGE(func, arg, type) \ ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ - last_func = (typeof(last_func))func; \ + last_func = func; \ last_arg = arg; \ }) diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h index 9d78d8e3912e..a56a57d67686 100644 --- a/drivers/block/drbd/drbd_state_change.h +++ b/drivers/block/drbd/drbd_state_change.h @@ -46,19 +46,19 @@ extern void forget_state_change(struct drbd_state_change *); extern int notify_resource_state_change(struct sk_buff *, unsigned int, - struct drbd_resource_state_change *, + void *, enum drbd_notification_type type); extern int notify_connection_state_change(struct sk_buff *, unsigned int, - struct drbd_connection_state_change *, + void *, enum drbd_notification_type type); extern int notify_device_state_change(struct sk_buff *, unsigned int, - struct drbd_device_state_change *, + void *, enum drbd_notification_type type); extern int notify_peer_device_state_change(struct sk_buff *, unsigned int, - struct drbd_peer_device_state_change *, + void *, enum drbd_notification_type type); #endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index d0e41d52d6a9..abf0486f0d4f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -530,14 +530,13 @@ static struct format_descr format_req; static char *floppy_track_buffer; static int max_buffer_sectors; -typedef void (*done_f)(int); static const struct cont_t { void (*interrupt)(void); /* this is called after the interrupt of the * main command */ void (*redo)(void); /* this is called to retry the operation */ void (*error)(void); /* this is called to tally an error */ - done_f done; /* this is called to say if the operation has + void (*done)(int); /* this is called to say if the operation has * succeeded/failed */ } *cont; @@ -985,6 +984,10 @@ static void empty(void) { } +static void empty_done(int result) +{ +} + static void (*floppy_work_fn)(void); static void floppy_work_workfn(struct work_struct *work) @@ -1998,14 +2001,14 @@ static const 
struct cont_t wakeup_cont = { .interrupt = empty, .redo = do_wakeup, .error = empty, - .done = (done_f)empty + .done = empty_done, }; static const struct cont_t intr_cont = { .interrupt = empty, .redo = process_fd_request, .error = empty, - .done = (done_f)empty + .done = empty_done, }; /* schedules handler, waiting for completion. May be interrupted, will then @@ -2784,7 +2787,6 @@ do_request: pending = set_next_request(); spin_unlock_irq(&floppy_lock); if (!pending) { - do_floppy = NULL; unlock_fdc(); return; } @@ -4513,13 +4515,16 @@ static bool floppy_available(int drive) static int floppy_alloc_disk(unsigned int drive, unsigned int type) { + struct queue_limits lim = { + .max_hw_sectors = 64, + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; - disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); + disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); - blk_queue_max_hw_sectors(disk->queue, 64); disk->major = FLOPPY_MAJOR; disk->first_minor = TOMINOR(drive) | (type << 2); disk->minors = 1; @@ -4591,7 +4596,6 @@ static int __init do_floppy_init(void) tag_sets[drive].nr_maps = 1; tag_sets[drive].queue_depth = 2; tag_sets[drive].numa_node = NUMA_NO_NODE; - tag_sets[drive].flags = BLK_MQ_F_SHOULD_MERGE; err = blk_mq_alloc_tag_set(&tag_sets[drive]); if (err) goto out_put_disk; @@ -5012,6 +5016,7 @@ module_param(floppy, charp, 0); module_param(FLOPPY_IRQ, int, 0); module_param(FLOPPY_DMA, int, 0); MODULE_AUTHOR("Alain L. Knaff"); +MODULE_DESCRIPTION("Normal floppy disk support"); MODULE_LICENSE("GPL"); /* This doesn't actually get used other than for module information */ diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8145499da38..c05fe27a96b6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -68,7 +68,6 @@ struct loop_device { struct list_head idle_worker_list; struct rb_root worker_tree; struct timer_list timer; - bool use_dio; bool sysfs_inited; struct request_queue *lo_queue; @@ -173,7 +172,7 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) static bool lo_bdev_can_use_dio(struct loop_device *lo, struct block_device *backing_bdev) { - unsigned short sb_bsize = bdev_logical_block_size(backing_bdev); + unsigned int sb_bsize = bdev_logical_block_size(backing_bdev); if (queue_logical_block_size(lo->lo_queue) < sb_bsize) return false; @@ -182,44 +181,44 @@ static bool lo_bdev_can_use_dio(struct loop_device *lo, return true; } -static void __loop_update_dio(struct loop_device *lo, bool dio) +static bool lo_can_use_dio(struct loop_device *lo) { - struct file *file = lo->lo_backing_file; - struct inode *inode = file->f_mapping->host; - struct block_device *backing_bdev = NULL; - bool use_dio; + struct inode *inode = lo->lo_backing_file->f_mapping->host; - if (S_ISBLK(inode->i_mode)) - backing_bdev = I_BDEV(inode); - else if (inode->i_sb->s_bdev) - backing_bdev = inode->i_sb->s_bdev; + if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) + return false; - use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) && - (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev)); + if (S_ISBLK(inode->i_mode)) + return lo_bdev_can_use_dio(lo, I_BDEV(inode)); + if (inode->i_sb->s_bdev) + return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev); + return true; +} - if (lo->use_dio == use_dio) - return; +/* + * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by + * passing in the LO_FLAGS_DIRECT_IO flag from userspace. 
It will be silently + * disabled when the device block size is too small or the offset is unaligned. + * + * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and + * not the originally passed in one. + */ +static inline void loop_update_dio(struct loop_device *lo) +{ + bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO; - /* flush dirty pages before changing direct IO */ - vfs_fsync(file, 0); + lockdep_assert_held(&lo->lo_mutex); + WARN_ON_ONCE(lo->lo_state == Lo_bound && + lo->lo_queue->mq_freeze_depth == 0); - /* - * The flag of LO_FLAGS_DIRECT_IO is handled similarly with - * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup - * will get updated by ioctl(LOOP_GET_STATUS) - */ - if (lo->lo_state == Lo_bound) - blk_mq_freeze_queue(lo->lo_queue); - lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); + if (lo->lo_backing_file->f_flags & O_DIRECT) lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } - if (lo->lo_state == Lo_bound) - blk_mq_unfreeze_queue(lo->lo_queue); + + /* flush dirty pages before starting to issue direct I/O */ + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use) + vfs_fsync(lo->lo_backing_file, 0); } /** @@ -302,6 +301,28 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, return 0; } +static void loop_clear_limits(struct loop_device *lo, int mode) +{ + struct queue_limits lim = queue_limits_start_update(lo->lo_queue); + + if (mode & FALLOC_FL_ZERO_RANGE) + lim.max_write_zeroes_sectors = 0; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + lim.max_hw_discard_sectors = 0; + lim.discard_granularity = 0; + } + + /* + * XXX: this updates the queue limits without freezing the queue, which + * is against the locking protocol and dangerous. But we can't just + * freeze the queue as we're inside the ->queue_rq method here. So this + * should move out into a workqueue unless we get the file operations to + * advertise if they support specific fallocate operations. + */ + queue_limits_commit_update(lo->lo_queue, &lim); +} + static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { @@ -320,6 +341,14 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) return -EIO; + + /* + * We initially configure the limits in a hope that fallocate is + * supported and clear them here if that turns out not to be true. 
+ */ + if (unlikely(ret == -EOPNOTSUPP)) + loop_clear_limits(lo, mode); + return ret; } @@ -445,9 +474,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = call_write_iter(file, &cmd->iocb, &iter); + ret = file->f_op->write_iter(&cmd->iocb, &iter); else - ret = call_read_iter(file, &cmd->iocb, &iter); + ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); @@ -500,12 +529,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) } } -static inline void loop_update_dio(struct loop_device *lo) -{ - __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | - lo->use_dio); -} - static void loop_reread_partitions(struct loop_device *lo) { int rc; @@ -563,6 +586,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, { struct file *file = fget(arg); struct file *old_file; + unsigned int memflags; int error; bool partscan; bool is_loop; @@ -600,14 +624,14 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, /* and ... switch */ disk_force_media_change(lo->lo_disk); - blk_mq_freeze_queue(lo->lo_queue); + memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); mapping_set_gfp_mask(file->f_mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; loop_global_unlock(lo, is_loop); @@ -750,12 +774,12 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo) +static void loop_get_discard_config(struct loop_device *lo, + u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - struct request_queue *q = lo->lo_queue; - u32 granularity, max_discard_sectors; + struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing @@ -765,38 +789,18 @@ static void loop_config_discard(struct loop_device *lo) * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode)) { - struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); + struct block_device *bdev = I_BDEV(inode); - max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = bdev_discard_granularity(I_BDEV(inode)) ?: - queue_physical_block_size(backingq); + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); + *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. 
*/ - } else if (!file->f_op->fallocate) { - max_discard_sectors = 0; - granularity = 0; - - } else { - struct kstatfs sbuf; - - max_discard_sectors = UINT_MAX >> 9; - if (!vfs_statfs(&file->f_path, &sbuf)) - granularity = sbuf.f_bsize; - else - max_discard_sectors = 0; - } - - if (max_discard_sectors) { - q->limits.discard_granularity = granularity; - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - } else { - q->limits.discard_granularity = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } } @@ -927,24 +931,6 @@ static void loop_free_idle_workers_timer(struct timer_list *timer) return loop_free_idle_workers(lo, false); } -static void loop_update_rotational(struct loop_device *lo) -{ - struct file *file = lo->lo_backing_file; - struct inode *file_inode = file->f_mapping->host; - struct block_device *file_bdev = file_inode->i_sb->s_bdev; - struct request_queue *q = lo->lo_queue; - bool nonrot = true; - - /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ - if (file_bdev) - nonrot = bdev_nonrot(file_bdev); - - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); -} - /** * loop_set_status_from_info - configure device from loop_info * @lo: struct loop_device to configure @@ -982,21 +968,62 @@ loop_set_status_from_info(struct loop_device *lo, memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_flags = info->lo_flags; return 0; } +static unsigned int loop_default_blocksize(struct loop_device *lo, + struct block_device *backing_bdev) +{ + /* In case of direct I/O, match underlying block size */ + if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) + return bdev_logical_block_size(backing_bdev); + return SECTOR_SIZE; +} + +static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, + unsigned int bsize) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; + u32 granularity = 0, max_discard_sectors = 0; + + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + + if (!bsize) + bsize = loop_default_blocksize(lo, backing_bdev); + + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + + lim->logical_block_size = bsize; + lim->physical_block_size = bsize; + lim->io_min = bsize; + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim->features |= BLK_FEAT_WRITE_CACHE; + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim->features |= BLK_FEAT_ROTATIONAL; + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim->discard_granularity = granularity; + else + lim->discard_granularity = 0; +} + static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { struct file *file = fget(config->fd); - struct inode *inode; struct address_space *mapping; + struct queue_limits lim; int error; loff_t size; bool partscan; - unsigned short bsize; bool is_loop; if (!file) @@ -1029,22 +1056,16 @@ static int 
loop_configure(struct loop_device *lo, blk_mode_t mode, goto out_unlock; mapping = file->f_mapping; - inode = mapping->host; if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } - if (config->block_size) { - error = blk_validate_block_size(config->block_size); - if (error) - goto out_unlock; - } - error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; + lo->lo_flags = config->info.lo_flags; if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) @@ -1066,29 +1087,18 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); - - if (config->block_size) - bsize = config->block_size; - else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) - /* In case of direct I/O, match underlying block size */ - bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - else - bsize = 512; - - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, config->block_size); + /* No need to freeze the queue as the device isn't bound yet. */ + error = queue_limits_commit_update(lo->lo_queue, &lim); + if (error) + goto out_unlock; - loop_config_discard(lo); - loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); @@ -1129,22 +1139,12 @@ out_putf: return error; } -static void __loop_clr_fd(struct loop_device *lo, bool release) +static void __loop_clr_fd(struct loop_device *lo) { + struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) - blk_queue_write_cache(lo->lo_queue, false, false); - - /* - * Freeze the request queue when unbinding on a live file descriptor and - * thus an open device. When called from ->release we are guaranteed - * that there is no I/O in progress already. - */ - if (!release) - blk_mq_freeze_queue(lo->lo_queue); - spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; @@ -1154,9 +1154,19 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - blk_queue_logical_block_size(lo->lo_queue, 512); - blk_queue_physical_block_size(lo->lo_queue, 512); - blk_queue_io_min(lo->lo_queue, 512); + + /* + * Reset the block size to the default. + * + * No queue freezing needed because this is called from the final + * ->release call only, so there can't be any outstanding I/O. 
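/*
 * Editor's sketch, not part of the patch: the staged queue-limits update
 * this series converts loop to (function name illustrative).  Limits are
 * edited in a local copy and validated/published in one step instead of
 * being poked into the queue with individual blk_queue_*() calls.
 */
static int sketch_set_block_size(struct request_queue *q, unsigned int bsize)
{
        struct queue_limits lim;

        lim = queue_limits_start_update(q);
        lim.logical_block_size = bsize;
        lim.physical_block_size = bsize;
        lim.io_min = bsize;
        return queue_limits_commit_update(q, &lim);
}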
+ */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ @@ -1164,8 +1174,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (!release) - blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk); @@ -1180,11 +1188,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) * must be at least one and it can only become zero when the * current holder is released. */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); @@ -1233,24 +1237,16 @@ static int loop_clr_fd(struct loop_device *lo) return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to roundown here to + * prevent new openers from coming in. */ - if (disk_openers(lo->lo_disk) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - loop_global_unlock(lo, true); - return 0; - } - lo->lo_state = Lo_rundown; + + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; loop_global_unlock(lo, true); - __loop_clr_fd(lo, false); return 0; } @@ -1258,9 +1254,9 @@ static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - int prev_lo_flags; bool partscan = false; bool size_changed = false; + unsigned int memflags; err = mutex_lock_killable(&lo->lo_mutex); if (err) @@ -1277,21 +1273,18 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) invalidate_bdev(lo->lo_device); } - /* I/O need to be drained during transfer transition */ - blk_mq_freeze_queue(lo->lo_queue); - - prev_lo_flags = lo->lo_flags; + /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ + memflags = blk_mq_freeze_queue(lo->lo_queue); err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - /* Mask out flags that can't be set using LOOP_SET_STATUS. 
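/*
 * Editor's sketch, not part of the patch: the freeze/unfreeze calling
 * convention used throughout this hunk.  blk_mq_freeze_queue() now
 * returns a cookie that must be handed back to blk_mq_unfreeze_queue();
 * the function name below is illustrative.
 */
static void sketch_update_under_freeze(struct request_queue *q)
{
        unsigned int memflags;

        memflags = blk_mq_freeze_queue(q);
        /* ... change state that must not race with in-flight I/O ... */
        blk_mq_unfreeze_queue(q, memflags);
}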
*/ - lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For those flags, use the previous values instead */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For flags that can't be cleared, use previous values too */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && + (info->lo_flags & LO_FLAGS_PARTSCAN); + + lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); if (size_changed) { loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, @@ -1299,17 +1292,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) loop_set_size(lo, new_size); } - /* update dio if lo_offset or transfer is changed */ - __loop_update_dio(lo, lo->use_dio); + /* update the direct I/O flag if lo_offset changed */ + loop_update_dio(lo); out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); - - if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && - !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); - partscan = true; - } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) @@ -1458,41 +1447,52 @@ static int loop_set_capacity(struct loop_device *lo) static int loop_set_dio(struct loop_device *lo, unsigned long arg) { - int error = -ENXIO; - if (lo->lo_state != Lo_bound) - goto out; + bool use_dio = !!arg; + unsigned int memflags; - __loop_update_dio(lo, !!arg); - if (lo->use_dio == !!arg) + if (lo->lo_state != Lo_bound) + return -ENXIO; + if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; - error = -EINVAL; - out: - return error; + + if (use_dio) { + if (!lo_can_use_dio(lo)) + return -EINVAL; + /* flush dirty pages before starting to use direct I/O */ + vfs_fsync(lo->lo_backing_file, 0); + } + + memflags = blk_mq_freeze_queue(lo->lo_queue); + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + return 0; } static int loop_set_block_size(struct loop_device *lo, unsigned long arg) { + struct queue_limits lim; + unsigned int memflags; int err = 0; if (lo->lo_state != Lo_bound) return -ENXIO; - err = blk_validate_block_size(arg); - if (err) - return err; - if (lo->lo_queue->limits.logical_block_size == arg) return 0; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); - blk_mq_freeze_queue(lo->lo_queue); - blk_queue_logical_block_size(lo->lo_queue, arg); - blk_queue_physical_block_size(lo->lo_queue, arg); - blk_queue_io_min(lo->lo_queue, arg); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, arg); + + memflags = blk_mq_freeze_queue(lo->lo_queue); + err = queue_limits_commit_update(lo->lo_queue, &lim); loop_update_dio(lo); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); return err; } @@ -1719,25 +1719,43 @@ static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, } #endif +static int lo_open(struct gendisk *disk, blk_mode_t mode) +{ + struct loop_device *lo = disk->private_data; + int err; + + err = mutex_lock_killable(&lo->lo_mutex); + if (err) + return err; + + if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) + err = -ENXIO; + mutex_unlock(&lo->lo_mutex); + return err; +} + static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + bool need_clear = false; if (disk_openers(disk) > 0) 
return; + /* + * Clear the backing device information if this is the last close of + * a device that's been marked for auto clear, or on which LOOP_CLR_FD + * has been called. + */ mutex_lock(&lo->lo_mutex); - if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - __loop_clr_fd(lo, true); - return; - } + + need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); + + if (need_clear) + __loop_clr_fd(lo); } static void lo_free_disk(struct gendisk *disk) @@ -1754,6 +1772,7 @@ static void lo_free_disk(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, + .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT @@ -1832,6 +1851,7 @@ static const struct kernel_param_ops loop_hw_qdepth_param_ops = { device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); +MODULE_DESCRIPTION("Loopback device support"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1854,7 +1874,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->use_aio = false; break; default: - cmd->use_aio = lo->use_dio; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } @@ -1982,6 +2002,12 @@ static const struct blk_mq_ops loop_mq_ops = { static int loop_add(int i) { + struct queue_limits lim = { + /* + * Random number picked from the historic block max_sectors cap. + */ + .max_hw_sectors = 2560u, + }; struct loop_device *lo; struct gendisk *disk; int err; @@ -2017,32 +2043,20 @@ static int loop_add(int i) lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | - BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; - /* random number picked from the history block max_sectors cap */ - blk_queue_max_hw_sectors(lo->lo_queue, 2560u); - - /* - * By default, we do buffer IO, so it doesn't make sense to enable - * merge because the I/O submitted to backing file is handled page by - * page. For directio mode, merge does help to dispatch bigger request - * to underlayer disk. We will enable merge once directio is enabled. - */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - /* * Disable partition scanning by default. 
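/*
 * Editor's sketch, not part of the patch: the allocation-time limits
 * pattern used by the loop_add() hunk above (names and values here are
 * illustrative).  Limits that used to be applied with blk_queue_*()
 * helpers after allocation are now passed to blk_mq_alloc_disk().
 */
static struct gendisk *sketch_alloc_disk(struct blk_mq_tag_set *set,
                                         void *queuedata)
{
        struct queue_limits lim = {
                .max_hw_sectors         = 2560,
                .logical_block_size     = SECTOR_SIZE,
        };

        return blk_mq_alloc_disk(set, &lim, queuedata);
}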
The in-kernel partition * scanning can be requested individually per-device during its diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b200950e8fb5..95361099a2dc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2259,35 +2259,20 @@ static const struct file_operations mtip_regs_fops = { .owner = THIS_MODULE, .open = simple_open, .read = mtip_hw_read_registers, - .llseek = no_llseek, }; static const struct file_operations mtip_flags_fops = { .owner = THIS_MODULE, .open = simple_open, .read = mtip_hw_read_flags, - .llseek = no_llseek, }; -static int mtip_hw_debugfs_init(struct driver_data *dd) +static void mtip_hw_debugfs_init(struct driver_data *dd) { - if (!dfs_parent) - return -1; - dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); - if (IS_ERR_OR_NULL(dd->dfs_node)) { - dev_warn(&dd->pdev->dev, - "Error creating node %s under debugfs\n", - dd->disk->disk_name); - dd->dfs_node = NULL; - return -1; - } - debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); debugfs_create_file("registers", 0444, dd->dfs_node, dd, &mtip_regs_fops); - - return 0; } static void mtip_hw_debugfs_exit(struct driver_data *dd) @@ -2716,7 +2701,12 @@ static int mtip_hw_init(struct driver_data *dd) int rv; unsigned long timeout, timetaken; - dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; + dd->mmio = pcim_iomap_region(dd->pdev, MTIP_ABAR, MTIP_DRV_NAME); + if (IS_ERR(dd->mmio)) { + dev_err(&dd->pdev->dev, "Unable to request / ioremap PCI region\n"); + return PTR_ERR(dd->mmio); + } + mtip_detect_product(dd); if (dd->product_type == MTIP_PRODUCT_UNKNOWN) { @@ -3401,6 +3391,12 @@ static const struct blk_mq_ops mtip_mq_ops = { */ static int mtip_block_initialize(struct driver_data *dd) { + struct queue_limits lim = { + .physical_block_size = 4096, + .max_hw_sectors = 0xffff, + .max_segments = MTIP_MAX_SG, + .max_segment_size = 0x400000, + }; int rv = 0, wait_for_rebuild = 0; sector_t capacity; unsigned int index = 0; @@ -3420,7 +3416,6 @@ static int mtip_block_initialize(struct driver_data *dd) dd->tags.reserved_tags = 1; dd->tags.cmd_size = sizeof(struct mtip_cmd); dd->tags.numa_node = dd->numa_node; - dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; dd->tags.driver_data = dd; dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS; @@ -3431,7 +3426,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto block_queue_alloc_tag_error; } - dd->disk = blk_mq_alloc_disk(&dd->tags, dd); + dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd); if (IS_ERR(dd->disk)) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); @@ -3479,14 +3474,7 @@ skip_create_disk: goto start_service_thread; /* Set device limits. */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); - blk_queue_max_segments(dd->queue, MTIP_MAX_SG); - blk_queue_physical_block_size(dd->queue, 4096); - blk_queue_max_hw_sectors(dd->queue, 0xffff); - blk_queue_max_segment_size(dd->queue, 0x400000); dma_set_max_seg_size(&dd->pdev->dev, 0x400000); - blk_queue_io_min(dd->queue, 4096); /* Set the capacity of the device in 512 byte sectors. */ if (!(mtip_hw_get_capacity(dd, &capacity))) { @@ -3726,13 +3714,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, goto iomap_err; } - /* Map BAR5 to memory. 
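/*
 * Editor's sketch, not part of the patch: the managed BAR mapping the
 * mtip32xx hunk above switches to (function name illustrative).
 * pcim_iomap_region() requests and ioremaps the BAR in one call and
 * reports failure via ERR_PTR(), replacing the separate
 * pcim_iomap_regions() + pcim_iomap_table() steps.
 */
static void __iomem *sketch_map_bar(struct pci_dev *pdev, int bar,
                                    const char *name)
{
        void __iomem *mmio = pcim_iomap_region(pdev, bar, name);

        if (IS_ERR(mmio))
                dev_err(&pdev->dev, "mapping BAR %d failed\n", bar);
        return mmio;
}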
*/ - rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME); - if (rv < 0) { - dev_err(&pdev->dev, "Unable to map regions\n"); - goto iomap_err; - } - rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rv) { dev_warn(&pdev->dev, "64-bit DMA enable failed\n"); @@ -4044,10 +4025,6 @@ static int __init mtip_init(void) mtip_major = error; dfs_parent = debugfs_create_dir("rssd", NULL); - if (IS_ERR_OR_NULL(dfs_parent)) { - pr_warn("Error creating debugfs parent\n"); - dfs_parent = NULL; - } /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index d914156db2d8..b9fdeff31caf 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -114,6 +114,10 @@ static const struct block_device_operations n64cart_fops = { */ static int __init n64cart_probe(struct platform_device *pdev) { + struct queue_limits lim = { + .physical_block_size = 4096, + .logical_block_size = 4096, + }; struct gendisk *disk; int err = -ENOMEM; @@ -131,9 +135,11 @@ static int __init n64cart_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out; + } disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART; @@ -144,10 +150,6 @@ static int __init n64cart_probe(struct platform_device *pdev) set_capacity(disk, size >> SECTOR_SHIFT); set_disk_ro(disk, 1); - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_physical_block_size(disk->queue, 4096); - blk_queue_logical_block_size(disk->queue, 4096); - err = add_disk(disk); if (err) goto out_cleanup_disk; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 33a8f37bb6a1..7bdc7eb808ea 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -62,6 +62,7 @@ struct nbd_sock { bool dead; int fallback_index; int cookie; + struct work_struct work; }; struct recv_thread_args { @@ -141,6 +142,9 @@ struct nbd_device { */ #define NBD_CMD_INFLIGHT 2 +/* Just part of request header or data payload is sent successfully */ +#define NBD_CMD_PARTIAL_SEND 3 + struct nbd_cmd { struct nbd_device *nbd; struct mutex lock; @@ -181,6 +185,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); + lockdep_assert_held(&cmd->lock); + + /* + * Clear INFLIGHT flag so that this cmd won't be completed in + * normal completion path + * + * INFLIGHT flag will be set when the cmd is queued to nbd next + * time. 
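/*
 * Editor's sketch, not part of the patch: the error handling adopted by
 * the n64cart hunk above (names illustrative).  blk_alloc_disk() now
 * takes the initial queue limits and reports failure via ERR_PTR()
 * instead of returning NULL.
 */
static int sketch_probe_disk(struct gendisk **diskp)
{
        struct queue_limits lim = {
                .logical_block_size     = 4096,
                .physical_block_size    = 4096,
        };
        struct gendisk *disk = blk_alloc_disk(&lim, NUMA_NO_NODE);

        if (IS_ERR(disk))
                return PTR_ERR(disk);
        *diskp = disk;
        return 0;
}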
+ */ + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); + if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) blk_mq_requeue_request(req, true); } @@ -222,7 +237,7 @@ static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); - struct nbd_device *nbd = (struct nbd_device *)disk->private_data; + struct nbd_device *nbd = disk->private_data; return sprintf(buf, "%d\n", nbd->pid); } @@ -236,7 +251,7 @@ static ssize_t backend_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); - struct nbd_device *nbd = (struct nbd_device *)disk->private_data; + struct nbd_device *nbd = disk->private_data; return sprintf(buf, "%s\n", nbd->backend ?: ""); } @@ -316,9 +331,11 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, nsock->sent = 0; } -static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, - loff_t blksize) +static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { + struct queue_limits lim; + int error; + if (!blksize) blksize = 1u << NBD_DEF_BLKSIZE_BITS; @@ -334,10 +351,29 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (!nbd->pid) return 0; + lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) - blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); - blk_queue_logical_block_size(nbd->disk->queue, blksize); - blk_queue_physical_block_size(nbd->disk->queue, blksize); + lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; + else + lim.max_hw_discard_sectors = 0; + if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) { + lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + } else { + lim.features |= BLK_FEAT_WRITE_CACHE; + lim.features &= ~BLK_FEAT_FUA; + } + if (nbd->config->flags & NBD_FLAG_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; + + lim.logical_block_size = blksize; + lim.physical_block_size = blksize; + error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim); + if (error) + return error; if (max_part) set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); @@ -389,6 +425,8 @@ static u32 req_to_nbd_cmd_type(struct request *req) return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; + case REQ_OP_WRITE_ZEROES: + return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } @@ -419,6 +457,12 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + /* partial send is handled in nbd_sock's work function */ + if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_RESET_TIMER; + } + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; @@ -459,8 +503,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } - mutex_unlock(&cmd->lock); nbd_requeue_cmd(cmd); + mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } @@ -567,8 +611,36 @@ static inline int was_interrupted(int result) return result == -ERESTARTSYS || result == -EINTR; } -/* always call with the tx_lock held */ -static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) +/* + * 
We've already sent header or part of data payload, have no choice but + * to set pending and schedule it in work. + * + * And we have to return BLK_STS_OK to block core, otherwise this same + * request may be re-dispatched with different tag, but our header has + * been sent out with old tag, and this way does confuse reply handling. + */ +static void nbd_sched_pending_work(struct nbd_device *nbd, + struct nbd_sock *nsock, + struct nbd_cmd *cmd, int sent) +{ + struct request *req = blk_mq_rq_from_pdu(cmd); + + /* pending work should be scheduled only once */ + WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)); + + nsock->pending = req; + nsock->sent = sent; + set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); + refcount_inc(&nbd->config_refs); + schedule_work(&nsock->work); +} + +/* + * Returns BLK_STS_RESOURCE if the caller should retry after a delay. + * Returns BLK_STS_IOERR if sending failed. + */ +static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, + int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_config *config = nbd->config; @@ -577,28 +649,32 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)}; struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; struct iov_iter from; - unsigned long size = blk_rq_bytes(req); struct bio *bio; u64 handle; u32 type; u32 nbd_cmd_flags = 0; int sent = nsock->sent, skip = 0; + lockdep_assert_held(&cmd->lock); + lockdep_assert_held(&nsock->tx_lock); + iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request)); type = req_to_nbd_cmd_type(req); if (type == U32_MAX) - return -EIO; + return BLK_STS_IOERR; if (rq_data_dir(req) == WRITE && (config->flags & NBD_FLAG_READ_ONLY)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Write on read-only\n"); - return -EIO; + return BLK_STS_IOERR; } if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the @@ -623,7 +699,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); - request.len = htonl(size); + request.len = htonl(blk_rq_bytes(req)); } handle = nbd_cmd_handle(cmd); request.cookie = cpu_to_be64(handle); @@ -644,15 +720,15 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) * completely done. */ if (sent) { - nsock->pending = req; - nsock->sent = sent; + nbd_sched_pending_work(nbd, nsock, cmd, sent); + return BLK_STS_OK; } set_bit(NBD_CMD_REQUEUED, &cmd->flags); return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); - return -EAGAIN; + goto requeue; } send_pages: if (type != NBD_CMD_WRITE) @@ -682,19 +758,13 @@ send_pages: result = sock_xmit(nbd, index, 1, &from, flags, &sent); if (result < 0) { if (was_interrupted(result)) { - /* We've already sent the header, we - * have no choice but to set pending and - * return BUSY. 
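/*
 * Editor's sketch, not part of the patch: the contract described in the
 * comment above, in isolation (names illustrative).  Once part of a
 * request has reached the wire, the driver must not let blk-mq
 * re-dispatch it under a new tag; it records how far it got (not shown),
 * defers the rest to a work item, and reports BLK_STS_OK.
 */
static blk_status_t sketch_defer_partial_send(struct work_struct *resume_work)
{
        schedule_work(resume_work);     /* finish sending asynchronously */
        return BLK_STS_OK;              /* request stays owned by the driver */
}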
- */ - nsock->pending = req; - nsock->sent = sent; - set_bit(NBD_CMD_REQUEUED, &cmd->flags); - return BLK_STS_RESOURCE; + nbd_sched_pending_work(nbd, nsock, cmd, sent); + return BLK_STS_OK; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); - return -EAGAIN; + goto requeue; } /* * The completion might already have come in, @@ -711,7 +781,62 @@ out: trace_nbd_payload_sent(req, handle); nsock->pending = NULL; nsock->sent = 0; - return 0; + __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); + return BLK_STS_OK; + +requeue: + /* + * Can't requeue in case we are dealing with partial send + * + * We must run from pending work function. + * */ + if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) + return BLK_STS_OK; + + /* retry on a different socket */ + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Request send failed, requeueing\n"); + nbd_mark_nsock_dead(nbd, nsock, 1); + nbd_requeue_cmd(cmd); + return BLK_STS_OK; +} + +/* handle partial sending */ +static void nbd_pending_cmd_work(struct work_struct *work) +{ + struct nbd_sock *nsock = container_of(work, struct nbd_sock, work); + struct request *req = nsock->pending; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + struct nbd_device *nbd = cmd->nbd; + unsigned long deadline = READ_ONCE(req->deadline); + unsigned int wait_ms = 2; + + mutex_lock(&cmd->lock); + + WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags)); + if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))) + goto out; + + mutex_lock(&nsock->tx_lock); + while (true) { + nbd_send_cmd(nbd, cmd, cmd->index); + if (!nsock->pending) + break; + + /* don't bother timeout handler for partial sending */ + if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) { + cmd->status = BLK_STS_IOERR; + blk_mq_complete_request(req); + break; + } + msleep(wait_ms); + wait_ms *= 2; + } + mutex_unlock(&nsock->tx_lock); + clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); +out: + mutex_unlock(&cmd->lock); + nbd_config_put(nbd); } static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, @@ -986,26 +1111,28 @@ static int wait_for_reconnect(struct nbd_device *nbd) return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); } -static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) +static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_device *nbd = cmd->nbd; struct nbd_config *config; struct nbd_sock *nsock; - int ret; + blk_status_t ret; + + lockdep_assert_held(&cmd->lock); config = nbd_get_config_unlocked(nbd); if (!config) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); - return -EINVAL; + return BLK_STS_IOERR; } if (index >= config->num_connections) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); - return -EINVAL; + return BLK_STS_IOERR; } cmd->status = BLK_STS_OK; again: @@ -1028,7 +1155,7 @@ again: */ sock_shutdown(nbd); nbd_config_put(nbd); - return -EIO; + return BLK_STS_IOERR; } goto again; } @@ -1041,27 +1168,10 @@ again: blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { nbd_requeue_cmd(cmd); - ret = 0; + ret = BLK_STS_OK; goto out; } - /* - * Some failures are related to the link going down, so anything that - * returns EAGAIN can be retried on a different socket. - */ ret = nbd_send_cmd(nbd, cmd, index); - /* - * Access to this flag is protected by cmd->lock, thus it's safe to set - * the flag after nbd_send_cmd() succeed to send request to server. 
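/*
 * Editor's sketch, not part of the patch: the bounded exponential
 * backoff used by nbd_pending_cmd_work() above, shown on its own
 * (names illustrative).  The retries give up before the request's own
 * deadline so the timeout handler never sees a half-sent command.
 */
static int sketch_retry_with_backoff(unsigned long deadline,
                                     bool (*try_again)(void *), void *arg)
{
        unsigned int wait_ms = 2;

        while (!try_again(arg)) {
                if (jiffies + msecs_to_jiffies(wait_ms) >= deadline)
                        return -ETIMEDOUT;      /* let the caller fail the request */
                msleep(wait_ms);
                wait_ms *= 2;                   /* 2, 4, 8, ... ms between attempts */
        }
        return 0;
}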
- */ - if (!ret) - __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); - else if (ret == -EAGAIN) { - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Request send failed, requeueing\n"); - nbd_mark_nsock_dead(nbd, nsock, 1); - nbd_requeue_cmd(cmd); - ret = 0; - } out: mutex_unlock(&nsock->tx_lock); nbd_config_put(nbd); @@ -1072,7 +1182,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - int ret; + blk_status_t ret; /* * Since we look at the bio's to send the request over the network we @@ -1092,10 +1202,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); - if (ret < 0) - ret = BLK_STS_IOERR; - else if (!ret) - ret = BLK_STS_OK; mutex_unlock(&cmd->lock); return ret; @@ -1128,6 +1234,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, struct socket *sock; struct nbd_sock **socks; struct nbd_sock *nsock; + unsigned int memflags; int err; /* Arg will be cast to int, check it to avoid overflow */ @@ -1141,7 +1248,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, * We need to make sure we don't get any errant requests while we're * reallocating the ->socks array. */ - blk_mq_freeze_queue(nbd->disk->queue); + memflags = blk_mq_freeze_queue(nbd->disk->queue); if (!netlink && !nbd->task_setup && !test_bit(NBD_RT_BOUND, &config->runtime_flags)) @@ -1179,14 +1286,15 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, nsock->pending = NULL; nsock->sent = 0; nsock->cookie = 0; + INIT_WORK(&nsock->work, nbd_pending_cmd_work); socks[config->num_connections++] = nsock; atomic_inc(&config->live_connections); - blk_mq_unfreeze_queue(nbd->disk->queue); + blk_mq_unfreeze_queue(nbd->disk->queue, memflags); return 0; put_socket: - blk_mq_unfreeze_queue(nbd->disk->queue); + blk_mq_unfreeze_queue(nbd->disk->queue, memflags); sockfd_put(sock); return err; } @@ -1262,19 +1370,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd) static void nbd_parse_flags(struct nbd_device *nbd) { - struct nbd_config *config = nbd->config; - if (config->flags & NBD_FLAG_READ_ONLY) + if (nbd->config->flags & NBD_FLAG_READ_ONLY) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); - if (config->flags & NBD_FLAG_SEND_FLUSH) { - if (config->flags & NBD_FLAG_SEND_FUA) - blk_queue_write_cache(nbd->disk->queue, true, true); - else - blk_queue_write_cache(nbd->disk->queue, true, false); - } - else - blk_queue_write_cache(nbd->disk->queue, false, false); } static void send_disconnects(struct nbd_device *nbd) @@ -1351,7 +1450,6 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->config = NULL; nbd->tag_set.timeout = 0; - blk_queue_max_discard_sectors(nbd->disk->queue, 0); mutex_unlock(&nbd->config_lock); nbd_put(nbd); @@ -1688,6 +1786,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); + if (flags & NBD_FLAG_ROTATIONAL) + seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); return 0; } @@ -1783,6 +1885,12 @@ static const struct blk_mq_ops nbd_mq_ops = { static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { + struct queue_limits lim = { + .max_hw_sectors = 65536, + .io_opt = 256 << SECTOR_SHIFT, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + }; struct nbd_device 
*nbd; struct gendisk *disk; int err = -ENOMEM; @@ -1796,8 +1904,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) nbd->tag_set.queue_depth = 128; nbd->tag_set.numa_node = NUMA_NO_NODE; nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); - nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_BLOCKING; + nbd->tag_set.flags = BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->backend = NULL; @@ -1823,7 +1930,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err < 0) goto out_free_tags; - disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); + disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_idr; @@ -1839,16 +1946,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) goto out_err_disk; } - /* - * Tell the block layer that we are not a rotational device - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_max_discard_sectors(disk->queue, 0); - blk_queue_max_segment_size(disk->queue, UINT_MAX); - blk_queue_max_segments(disk->queue, USHRT_MAX); - blk_queue_max_hw_sectors(disk->queue, 65536); - disk->queue->limits.max_sectors = 256; - mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); /* @@ -2145,6 +2242,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) flush_workqueue(nbd->recv_workq); nbd_clear_que(nbd); nbd->task_setup = NULL; + clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags); mutex_unlock(&nbd->config_lock); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, @@ -2433,6 +2531,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) } dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); + if (!dev_list) { + nlmsg_free(reply); + ret = -EMSGSIZE; + goto out; + } + if (index == -1) { ret = idr_for_each(&nbd_index_idr, &status_cb, reply); if (ret) { diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 36755f263e8e..fdc7a0b2af10 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -77,7 +77,7 @@ enum { NULL_IRQ_TIMER = 2, }; -static bool g_virt_boundary = false; +static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); @@ -115,6 +115,18 @@ module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif +/* + * Historic queue modes. + * + * These days nothing but NULL_Q_MQ is actually supported, but we keep it the + * enum for error reporting. 
+ */ +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) @@ -165,8 +177,8 @@ static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); -static bool shared_tags; -module_param(shared_tags, bool, 0444); +static bool g_shared_tags; +module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; @@ -213,6 +225,10 @@ static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +static bool g_fua = true; +module_param_named(fua, g_fua, bool, 0444); +MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); + static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); @@ -241,6 +257,19 @@ static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); +static int g_zone_append_max_sectors = INT_MAX; +module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); +MODULE_PARM_DESC(zone_append_max_sectors, + "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); + +static bool g_zone_full; +module_param_named(zone_full, g_zone_full, bool, S_IRUGO); +MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); + +static bool g_rotational; +module_param_named(rotational, g_rotational, bool, S_IRUGO); +MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. 
Default: false"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -392,13 +421,25 @@ static int nullb_update_nr_hw_queues(struct nullb_device *dev, static int nullb_apply_submit_queues(struct nullb_device *dev, unsigned int submit_queues) { - return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); + int ret; + + mutex_lock(&lock); + ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); + mutex_unlock(&lock); + + return ret; } static int nullb_apply_poll_queues(struct nullb_device *dev, unsigned int poll_queues) { - return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); + int ret; + + mutex_lock(&lock); + ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); + mutex_unlock(&lock); + + return ret; } NULLB_DEVICE_ATTR(size, ulong, NULL); @@ -424,9 +465,14 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); +NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); +NULLB_DEVICE_ATTR(fua, bool, NULL); +NULLB_DEVICE_ATTR(rotational, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -444,28 +490,32 @@ static ssize_t nullb_device_power_store(struct config_item *item, if (ret < 0) return ret; + ret = count; + mutex_lock(&lock); if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) - return count; + goto out; + ret = null_add_dev(dev); if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return ret; + goto out; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); dev->power = newp; null_del_dev(dev->nullb); - mutex_unlock(&lock); } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } - return count; +out: + mutex_unlock(&lock); + return ret; } CONFIGFS_ATTR(nullb_device_, power); @@ -567,11 +617,16 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, + &nullb_device_attr_zone_full, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, + &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_fua, + &nullb_device_attr_rotational, NULL, }; @@ -650,13 +705,15 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size," + "badblocks,blocking,blocksize,cache_size,fua," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," - "poll_queues,power,queue_mode,shared_tag_bitmap,size," - "submit_queues,use_per_node_hctx,virt_boundary,zoned," - "zone_capacity,zone_max_active,zone_max_open," - "zone_nr_conv,zone_offline,zone_readonly,zone_size\n"); + "poll_queues,power,queue_mode,shared_tag_bitmap," 
+ "shared_tags,size,submit_queues,use_per_node_hctx," + "virt_boundary,zoned,zone_capacity,zone_max_active," + "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," + "zone_size,zone_append_max_sectors,zone_full," + "rotational\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -736,9 +793,15 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; + dev->zone_append_max_sectors = g_zone_append_max_sectors; + dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; + dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; + dev->fua = g_fua; + dev->rotational = g_rotational; + return dev; } @@ -752,98 +815,11 @@ static void null_free_dev(struct nullb_device *dev) kfree(dev); } -static void put_tag(struct nullb_queue *nq, unsigned int tag) -{ - clear_bit_unlock(tag, nq->tag_map); - - if (waitqueue_active(&nq->wait)) - wake_up(&nq->wait); -} - -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - -static void free_cmd(struct nullb_cmd *cmd) -{ - put_tag(cmd->nq, cmd->tag); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); - -static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; - } - - return NULL; -} - -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) -{ - struct nullb_cmd *cmd; - DEFINE_WAIT(wait); - - do { - /* - * This avoids multiple return statements, multiple calls to - * __alloc_cmd() and a fast path call to prepare_to_wait(). 
- */ - cmd = __alloc_cmd(nq); - if (cmd) { - cmd->bio = bio; - return cmd; - } - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); - io_schedule(); - finish_wait(&nq->wait, &wait); - } while (1); -} - -static void end_cmd(struct nullb_cmd *cmd) -{ - int queue_mode = cmd->nq->dev->queue_mode; - - switch (queue_mode) { - case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, cmd->error); - return; - case NULL_Q_BIO: - cmd->bio->bi_status = cmd->error; - bio_endio(cmd->bio); - break; - } - - free_cmd(cmd); -} - static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { - end_cmd(container_of(timer, struct nullb_cmd, timer)); + struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); + blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); return HRTIMER_NORESTART; } @@ -856,7 +832,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_complete_rq(struct request *rq) { - end_cmd(blk_mq_rq_to_pdu(rq)); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + + blk_mq_end_request(rq, cmd->error); } static struct nullb_page *null_alloc_page(void) @@ -929,7 +907,7 @@ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); - WARN_ON(!t_page || t_page->page->index != idx); + WARN_ON(!t_page || t_page->page->private != idx); } else if (is_cache) nullb->dev->curr_cache += PAGE_SIZE; @@ -952,7 +930,7 @@ static void null_free_device_storage(struct nullb_device *dev, bool is_cache) (void **)t_pages, pos, FREE_BATCH); for (i = 0; i < nr_pages; i++) { - pos = t_pages[i]->page->index; + pos = t_pages[i]->page->private; ret = radix_tree_delete_item(root, pos, t_pages[i]); WARN_ON(ret != t_pages[i]); null_free_page(ret); @@ -978,7 +956,7 @@ static struct nullb_page *__null_lookup_page(struct nullb *nullb, root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; t_page = radix_tree_lookup(root, idx); - WARN_ON(t_page && t_page->page->index != idx); + WARN_ON(t_page && t_page->page->private != idx); if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; @@ -1021,7 +999,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; - t_page->page->index = idx; + t_page->page->private = idx; t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); @@ -1041,7 +1019,7 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) struct nullb_page *t_page, *ret; void *dst, *src; - idx = c_page->page->index; + idx = c_page->page->private; t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); @@ -1100,7 +1078,7 @@ again: * avoid race, we don't allow page free */ for (i = 0; i < nr_pages; i++) { - nullb->cache_flush_pos = c_pages[i]->page->index; + nullb->cache_flush_pos = c_pages[i]->page->private; /* * We found the page which is being flushed to disk by other * threads @@ -1220,7 +1198,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev, return BLK_STS_OK; } -static int null_handle_flush(struct nullb *nullb) +static blk_status_t null_handle_flush(struct nullb *nullb) { int err; @@ -1237,7 +1215,7 @@ static int null_handle_flush(struct nullb *nullb) WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); - return err; + return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, @@ -1271,11 +1249,11 @@ static int null_transfer(struct nullb *nullb, struct page *page, return err; } -static int null_handle_rq(struct nullb_cmd *cmd) +static blk_status_t null_handle_rq(struct nullb_cmd *cmd) { - struct request *rq = cmd->rq; + struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; - int err; + int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); struct req_iterator iter; @@ -1287,57 +1265,13 @@ static int null_handle_rq(struct nullb_cmd *cmd) err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - - return 0; -} - -static int null_handle_bio(struct nullb_cmd *cmd) -{ - struct bio *bio = cmd->bio; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = bio->bi_iter.bi_sector; - struct bio_vec bvec; - struct bvec_iter iter; - - spin_lock_irq(&nullb->lock); - bio_for_each_segment(bvec, bio, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector, - bio->bi_opf & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } + if (err) + break; sector += len >> SECTOR_SHIFT; } spin_unlock_irq(&nullb->lock); - return 0; -} - -static void null_stop_queue(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_stop_hw_queues(q); -} - -static void null_restart_queue_async(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_start_stopped_hw_queues(q, true); + return errno_to_blk_status(err); } static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) @@ -1345,16 +1279,16 @@ static inline 
blk_status_t null_handle_throttled(struct nullb_cmd *cmd) struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts = BLK_STS_OK; - struct request *rq = cmd->rq; + struct request *rq = blk_mq_rq_from_pdu(cmd); if (!hrtimer_active(&nullb->bw_timer)) hrtimer_restart(&nullb->bw_timer); if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); + blk_mq_stop_hw_queues(nullb->q); /* race with timer */ if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); /* requeue request */ sts = BLK_STS_DEV_RESOURCE; } @@ -1381,37 +1315,29 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; - int err; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - if (dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); - - return errno_to_blk_status(err); + return null_handle_rq(cmd); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) { + struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb_device *dev = cmd->nq->dev; struct bio *bio; - if (dev->memory_backed) - return; - - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { - zero_fill_bio(cmd->bio); - } else if (req_op(cmd->rq) == REQ_OP_READ) { - __rq_for_each_bio(bio, cmd->rq) + if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } } static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { + struct request *rq = blk_mq_rq_from_pdu(cmd); + /* * Since root privileges are required to configure the null_blk * driver, it is fine that this driver does not initialize the @@ -1425,20 +1351,10 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { - case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); - break; - case NULL_Q_BIO: - /* - * XXX: no proper submitting cpu information available. 
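/*
 * Editor's sketch, not part of the patch: the throttling idiom used by
 * null_handle_throttled() above (names illustrative).  When the
 * per-interval byte budget runs out, the hardware queues are stopped and
 * the request is returned with BLK_STS_DEV_RESOURCE; a timer refills the
 * budget and restarts the stopped queues.
 */
static blk_status_t sketch_throttle(struct request_queue *q,
                                    atomic_long_t *budget, long bytes)
{
        if (atomic_long_sub_return(bytes, budget) < 0) {
                blk_mq_stop_hw_queues(q);
                /* close the race with a refill that happened meanwhile */
                if (atomic_long_read(budget) > 0)
                        blk_mq_start_stopped_hw_queues(q, true);
                return BLK_STS_DEV_RESOURCE;
        }
        return BLK_STS_OK;
}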
- */ - end_cmd(cmd); - break; - } + blk_mq_complete_request(rq); break; case NULL_IRQ_NONE: - end_cmd(cmd); + blk_mq_end_request(rq, cmd->error); break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); @@ -1472,7 +1388,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, blk_status_t sts; if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + cmd->error = null_handle_flush(nullb); goto out; } @@ -1499,7 +1415,7 @@ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); hrtimer_forward_now(&nullb->bw_timer, timer_interval); @@ -1516,26 +1432,6 @@ static void nullb_setup_bwtimer(struct nullb *nullb) hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } -static struct nullb_queue *nullb_to_queue(struct nullb *nullb) -{ - int index = 0; - - if (nullb->nr_queues != 1) - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); - - return &nullb->queues[index]; -} - -static void null_submit_bio(struct bio *bio) -{ - sector_t sector = bio->bi_iter.bi_sector; - sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; - struct nullb_queue *nq = nullb_to_queue(nullb); - - null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); -} - #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool should_timeout_request(struct request *rq) @@ -1653,9 +1549,9 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); - if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, - blk_mq_end_request_batch)) - end_cmd(cmd); + if (!blk_mq_add_to_batch(req, iob, cmd->error != BLK_STS_OK, + blk_mq_end_request_batch)) + blk_mq_end_request(req, cmd->error); nr++; } @@ -1711,7 +1607,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } - cmd->rq = rq; cmd->error = BLK_STS_OK; cmd->nq = nq; cmd->fake_timeout = should_timeout_request(rq) || @@ -1751,10 +1646,9 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void null_queue_rqs(struct request **rqlist) +static void null_queue_rqs(struct rq_list *rqlist) { - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; struct blk_mq_queue_data bd = { }; blk_status_t ret; @@ -1764,40 +1658,14 @@ static void null_queue_rqs(struct request **rqlist) bd.rq = rq; ret = null_queue_rq(rq->mq_hctx, &bd); if (ret != BLK_STS_OK) - rq_list_add_tail(&requeue_lastp, rq); - } while (!rq_list_empty(*rqlist)); + rq_list_add_tail(&requeue_list, rq); + } while (!rq_list_empty(rqlist)); *rqlist = requeue_list; } -static void cleanup_queue(struct nullb_queue *nq) -{ - bitmap_free(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - -static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nullb_queue *nq = hctx->driver_data; - struct nullb *nullb = nq->dev->nullb; - - nullb->nr_queues--; -} - static void 
null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; nq->dev = nullb->dev; INIT_LIST_HEAD(&nq->poll_list); spin_lock_init(&nq->poll_lock); @@ -1815,7 +1683,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); - nullb->nr_queues++; return 0; } @@ -1828,7 +1695,6 @@ static const struct blk_mq_ops null_mq_ops = { .poll = null_poll, .map_queues = null_map_queues, .init_hctx = null_init_hctx, - .exit_hctx = null_exit_hctx, }; static void null_del_dev(struct nullb *nullb) @@ -1849,21 +1715,20 @@ static void null_del_dev(struct nullb *nullb) if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); atomic_long_set(&nullb->cur_bytes, LONG_MAX); - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); } put_disk(nullb->disk); - if (dev->queue_mode == NULL_Q_MQ && - nullb->tag_set == &nullb->__tag_set) + if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); - cleanup_queues(nullb); + kfree(nullb->queues); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } -static void null_config_discard(struct nullb *nullb) +static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) { if (nullb->dev->discard == false) return; @@ -1880,43 +1745,14 @@ static void null_config_discard(struct nullb *nullb) return; } - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + lim->max_hw_discard_sectors = UINT_MAX >> 9; } -static const struct block_device_operations null_bio_ops = { - .owner = THIS_MODULE, - .submit_bio = null_submit_bio, - .report_zones = null_report_zones, -}; - -static const struct block_device_operations null_rq_ops = { +static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; -static int setup_commands(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - int i; - - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); - if (!nq->cmds) - return -ENOMEM; - - nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); - if (!nq->tag_map) { - kfree(nq->cmds); - return -ENOMEM; - } - - for (i = 0; i < nq->queue_depth; i++) { - cmd = &nq->cmds[i]; - cmd->tag = -1U; - } - - return 0; -} - static int setup_queues(struct nullb *nullb) { int nqueues = nr_cpu_ids; @@ -1929,101 +1765,64 @@ static int setup_queues(struct nullb *nullb) if (!nullb->queues) return -ENOMEM; - nullb->queue_depth = nullb->dev->hw_queue_depth; return 0; } -static int init_driver_queues(struct nullb *nullb) +static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) { - struct nullb_queue *nq; - int i, ret = 0; - - for (i = 0; i < nullb->dev->submit_queues; i++) { - nq = &nullb->queues[i]; - - null_init_queue(nullb, nq); - - ret = setup_commands(nq); - if (ret) - return ret; - nullb->nr_queues++; + set->ops = &null_mq_ops; + set->cmd_size = sizeof(struct nullb_cmd); + set->timeout = 5 * HZ; + set->nr_maps = 1; + if (poll_queues) { + set->nr_hw_queues += poll_queues; + set->nr_maps += 2; } - return 0; + return blk_mq_alloc_tag_set(set); } -static int null_gendisk_register(struct nullb *nullb) +static int null_init_global_tag_set(void) { - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk = nullb->disk; + int error; - set_capacity(disk, size); - - disk->major = 
null_major; - disk->first_minor = nullb->index; - disk->minors = 1; - if (queue_is_mq(nullb->q)) - disk->fops = &null_rq_ops; - else - disk->fops = &null_bio_ops; - disk->private_data = nullb; - strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); - - if (nullb->dev->zoned) { - int ret = null_register_zoned_dev(nullb); + if (tag_set.ops) + return 0; - if (ret) - return ret; - } + tag_set.nr_hw_queues = g_submit_queues; + tag_set.queue_depth = g_hw_queue_depth; + tag_set.numa_node = g_home_node; + if (g_no_sched) + tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; + if (g_shared_tag_bitmap) + tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (g_blocking) + tag_set.flags |= BLK_MQ_F_BLOCKING; - return add_disk(disk); + error = null_init_tag_set(&tag_set, g_poll_queues); + if (error) + tag_set.ops = NULL; + return error; } -static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) +static int null_setup_tagset(struct nullb *nullb) { - unsigned int flags = BLK_MQ_F_SHOULD_MERGE; - int hw_queues, numa_node; - unsigned int queue_depth; - int poll_queues; - - if (nullb) { - hw_queues = nullb->dev->submit_queues; - poll_queues = nullb->dev->poll_queues; - queue_depth = nullb->dev->hw_queue_depth; - numa_node = nullb->dev->home_node; - if (nullb->dev->no_sched) - flags |= BLK_MQ_F_NO_SCHED; - if (nullb->dev->shared_tag_bitmap) - flags |= BLK_MQ_F_TAG_HCTX_SHARED; - if (nullb->dev->blocking) - flags |= BLK_MQ_F_BLOCKING; - } else { - hw_queues = g_submit_queues; - poll_queues = g_poll_queues; - queue_depth = g_hw_queue_depth; - numa_node = g_home_node; - if (g_no_sched) - flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - flags |= BLK_MQ_F_TAG_HCTX_SHARED; - if (g_blocking) - flags |= BLK_MQ_F_BLOCKING; + if (nullb->dev->shared_tags) { + nullb->tag_set = &tag_set; + return null_init_global_tag_set(); } - set->ops = &null_mq_ops; - set->cmd_size = sizeof(struct nullb_cmd); - set->flags = flags; - set->driver_data = nullb; - set->nr_hw_queues = hw_queues; - set->queue_depth = queue_depth; - set->numa_node = numa_node; - if (poll_queues) { - set->nr_hw_queues += poll_queues; - set->nr_maps = 3; - } else { - set->nr_maps = 1; - } - - return blk_mq_alloc_tag_set(set); + nullb->tag_set = &nullb->__tag_set; + nullb->tag_set->driver_data = nullb; + nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; + nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; + nullb->tag_set->numa_node = nullb->dev->home_node; + if (nullb->dev->no_sched) + nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; + if (nullb->dev->shared_tag_bitmap) + nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (nullb->dev->blocking) + nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; + return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); } static int null_validate_conf(struct nullb_device *dev) @@ -2032,11 +1831,12 @@ static int null_validate_conf(struct nullb_device *dev) pr_err("legacy IO path is no longer available\n"); return -EINVAL; } + if (dev->queue_mode == NULL_Q_BIO) { + pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); + dev->queue_mode = NULL_Q_MQ; + } - dev->blocksize = round_down(dev->blocksize, 512); - dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; } else if (dev->submit_queues > nr_cpu_ids) @@ -2048,8 +1848,6 @@ static int null_validate_conf(struct 
nullb_device *dev) if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; dev->prev_poll_queues = dev->poll_queues; - - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); /* Do memory allocation, so set blocking */ @@ -2060,9 +1858,6 @@ static int null_validate_conf(struct nullb_device *dev) dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); - /* can not stop a queue */ - if (dev->queue_mode == NULL_Q_BIO) - dev->mbps = 0; if (dev->zoned && (!dev->zone_size || !is_power_of_2(dev->zone_size))) { @@ -2102,6 +1897,12 @@ static bool null_setup_fault(void) static int null_add_dev(struct nullb_device *dev) { + struct queue_limits lim = { + .logical_block_size = dev->blocksize, + .physical_block_size = dev->blocksize, + .max_hw_sectors = dev->max_sectors, + }; + struct nullb *nullb; int rv; @@ -2123,75 +1924,49 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_free_nullb; - if (dev->queue_mode == NULL_Q_MQ) { - if (shared_tags) { - nullb->tag_set = &tag_set; - rv = 0; - } else { - nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb, nullb->tag_set); - } + rv = null_setup_tagset(nullb); + if (rv) + goto out_cleanup_queues; + if (dev->virt_boundary) + lim.virt_boundary_mask = PAGE_SIZE - 1; + null_config_discard(nullb, &lim); + if (dev->zoned) { + rv = null_init_zoned_dev(dev, &lim); if (rv) - goto out_cleanup_queues; - - nullb->tag_set->timeout = 5 * HZ; - nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); - if (IS_ERR(nullb->disk)) { - rv = PTR_ERR(nullb->disk); goto out_cleanup_tags; - } - nullb->q = nullb->disk->queue; - } else if (dev->queue_mode == NULL_Q_BIO) { - rv = -ENOMEM; - nullb->disk = blk_alloc_disk(nullb->dev->home_node); - if (!nullb->disk) - goto out_cleanup_queues; - - nullb->q = nullb->disk->queue; - rv = init_driver_queues(nullb); - if (rv) - goto out_cleanup_disk; - } - - if (dev->mbps) { - set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); - nullb_setup_bwtimer(nullb); } if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; } - if (dev->zoned) { - rv = null_init_zoned_dev(dev, nullb->q); - if (rv) - goto out_cleanup_disk; + if (dev->rotational) + lim.features |= BLK_FEAT_ROTATIONAL; + + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); + if (IS_ERR(nullb->disk)) { + rv = PTR_ERR(nullb->disk); + goto out_cleanup_zone; + } + nullb->q = nullb->disk->queue; + + if (dev->mbps) { + set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); + nullb_setup_bwtimer(nullb); } nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - mutex_lock(&lock); rv = ida_alloc(&nullb_indexes, GFP_KERNEL); - if (rv < 0) { - mutex_unlock(&lock); - goto out_cleanup_zone; - } + if (rv < 0) + goto out_cleanup_disk; + nullb->index = rv; dev->index = rv; - mutex_unlock(&lock); - - blk_queue_logical_block_size(nullb->q, dev->blocksize); - blk_queue_physical_block_size(nullb->q, dev->blocksize); - if (dev->max_sectors) - blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); - - if (dev->virt_boundary) - blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1); - - null_config_discard(nullb); if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ @@ -2201,13 +1976,26 @@ static int 
null_add_dev(struct nullb_device *dev) sprintf(nullb->disk_name, "nullb%d", nullb->index); } - rv = null_gendisk_register(nullb); + set_capacity(nullb->disk, + ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); + nullb->disk->major = null_major; + nullb->disk->first_minor = nullb->index; + nullb->disk->minors = 1; + nullb->disk->fops = &null_ops; + nullb->disk->private_data = nullb; + strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + if (nullb->dev->zoned) { + rv = null_register_zoned_dev(nullb); + if (rv) + goto out_ida_free; + } + + rv = add_disk(nullb->disk); if (rv) goto out_ida_free; - mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); - mutex_unlock(&lock); pr_info("disk %s created\n", nullb->disk_name); @@ -2215,15 +2003,15 @@ static int null_add_dev(struct nullb_device *dev) out_ida_free: ida_free(&nullb_indexes, nullb->index); -out_cleanup_zone: - null_free_zoned_dev(dev); out_cleanup_disk: put_disk(nullb->disk); +out_cleanup_zone: + null_free_zoned_dev(dev); out_cleanup_tags: - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: - cleanup_queues(nullb); + kfree(nullb->queues); out_free_nullb: kfree(nullb); dev->nullb = NULL; @@ -2256,7 +2044,9 @@ static int null_create_dev(void) if (!dev) return -ENOMEM; + mutex_lock(&lock); ret = null_add_dev(dev); + mutex_unlock(&lock); if (ret) { null_free_dev(dev); return ret; @@ -2299,7 +2089,7 @@ static int __init null_init(void) return -EINVAL; } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); @@ -2311,18 +2101,12 @@ static int __init null_init(void) g_submit_queues = 1; } - if (g_queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(NULL, &tag_set); - if (ret) - return ret; - } - config_group_init(&nullb_subsys.su_group); mutex_init(&nullb_subsys.su_mutex); ret = configfs_register_subsystem(&nullb_subsys); if (ret) - goto err_tagset; + return ret; mutex_init(&lock); @@ -2349,9 +2133,6 @@ err_dev: unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); -err_tagset: - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); return ret; } @@ -2370,12 +2151,15 @@ static void __exit null_exit(void) } mutex_unlock(&lock); - if (g_queue_mode == NULL_Q_MQ && shared_tags) + if (tag_set.ops) blk_mq_free_tag_set(&tag_set); + + mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); +MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 929f659dd255..6f9fe6171087 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -16,11 +16,6 @@ #include <linux/mutex.h> struct nullb_cmd { - union { - struct request *rq; - struct bio *bio; - }; - unsigned int tag; blk_status_t error; bool fake_timeout; struct nullb_queue *nq; @@ -28,16 +23,11 @@ struct nullb_cmd { }; struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; struct nullb_device *dev; unsigned int requeue_selection; struct list_head poll_list; spinlock_t poll_lock; - - struct nullb_cmd *cmds; }; struct nullb_zone { @@ -60,13 +50,6 @@ struct nullb_zone { unsigned int 
capacity; }; -/* Queue modes */ -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - struct nullb_device { struct nullb *nullb; struct config_group group; @@ -99,6 +82,7 @@ struct nullb_device { unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int zone_max_open; /* max number of open zones */ unsigned int zone_max_active; /* max number of active zones */ + unsigned int zone_append_max_sectors; /* Max sectors per zone append command */ unsigned int submit_queues; /* number of submission queues */ unsigned int prev_submit_queues; /* number of submission queues before change */ unsigned int poll_queues; /* number of IOPOLL submission queues */ @@ -117,9 +101,13 @@ struct nullb_device { bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ bool zoned; /* if device is zoned */ + bool zone_full; /* Initialize zones to be full */ bool virt_boundary; /* virtual boundary on/off for the device */ bool no_sched; /* no IO scheduler for the device */ + bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ + bool fua; /* Support FUA */ + bool rotational; /* Fake rotational device */ }; struct nullb { @@ -130,14 +118,12 @@ struct nullb { struct gendisk *disk; struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; atomic_long_t cur_bytes; struct hrtimer bw_timer; unsigned long cache_flush_pos; spinlock_t lock; struct nullb_queue *queues; - unsigned int nr_queues; char disk_name[DISK_NAME_LEN]; }; @@ -147,7 +133,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, @@ -160,7 +146,7 @@ ssize_t zone_cond_store(struct nullb_device *dev, const char *page, size_t count, enum blk_zone_cond cond); #else static inline int null_init_zoned_dev(struct nullb_device *dev, - struct request_queue *q) + struct queue_limits *lim) { pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); return -EINVAL; diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index 6b2b370e786f..82b8f6a5e5f0 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -36,15 +36,21 @@ TRACE_EVENT(nullb_zone_op, TP_ARGS(cmd, zone_no, zone_cond), TP_STRUCT__entry( __array(char, disk, DISK_NAME_LEN) - __field(enum req_op, op) + /* + * __field() uses is_signed_type(). is_signed_type() does not + * support bitwise types. Use __field_struct() instead because + * it does not use is_signed_type(). 
+ */ + __field_struct(enum req_op, op) __field(unsigned int, zone_no) __field(unsigned int, zone_cond) ), TP_fast_assign( - __entry->op = req_op(cmd->rq); + __entry->op = req_op(blk_mq_rq_from_pdu(cmd)); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->q->disk); + __assign_disk_name(__entry->disk, + blk_mq_rq_from_pdu(cmd)->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 6f5e0994862e..0d5f9bf95229 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -9,6 +9,8 @@ #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt +#define NULL_ZONE_INVALID_WP ((sector_t)-1) + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - static inline void null_init_zone_lock(struct nullb_device *dev, struct nullb_zone *zone) { @@ -58,7 +48,8 @@ static inline void null_unlock_zone(struct nullb_device *dev, mutex_unlock(&zone->mutex); } -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +int null_init_zoned_dev(struct nullb_device *dev, + struct queue_limits *lim) { sector_t dev_capacity_sects, zone_capacity_sects; struct nullb_zone *zone; @@ -83,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } + /* + * If a smaller zone capacity was requested, do not allow a smaller last + * zone at the same time as such zone configuration does not correspond + * to any real zoned device. 
+ */ + if (dev->zone_capacity != dev->zone_size && + dev->size & (dev->zone_size - 1)) { + pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n"); + return -EINVAL; + } + zone_capacity_sects = mb_to_sects(dev->zone_capacity); dev_capacity_sects = mb_to_sects(dev->size); dev->zone_size_sects = mb_to_sects(dev->zone_size); @@ -102,6 +104,11 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_nr_conv); } + dev->zone_append_max_sectors = + min(ALIGN_DOWN(dev->zone_append_max_sectors, + dev->blocksize >> SECTOR_SHIFT), + zone_capacity_sects); + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_active = 0; @@ -112,7 +119,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { dev->zone_max_open = dev->zone_max_active; pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); + dev->zone_max_open); } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); @@ -138,7 +145,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone = &dev->zones[i]; null_init_zone_lock(dev, zone); - zone->start = zone->wp = sector; + zone->start = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; else @@ -146,32 +153,35 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->capacity = min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; + if (dev->zone_full) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->capacity; + } else{ + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + } sector += dev->zone_size_sects; } + lim->features |= BLK_FEAT_ZONED; + lim->chunk_sectors = dev->zone_size_sects; + lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors; + lim->max_open_zones = dev->zone_max_open; + lim->max_active_zones = dev->zone_max_active; return 0; } int null_register_zoned_dev(struct nullb *nullb) { - struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; + struct gendisk *disk = nullb->disk; - disk_set_zoned(nullb->disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - blk_queue_chunk_sectors(q, dev->zone_size_sects); - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - disk_set_max_open_zones(nullb->disk, dev->zone_max_open); - disk_set_max_active_zones(nullb->disk, dev->zone_max_active); - - if (queue_is_mq(q)) - return blk_revalidate_disk_zones(nullb->disk, NULL); + pr_info("%s: using %s zone append\n", + disk->disk_name, + queue_emulates_zone_append(q) ? 
"emulated" : "native"); - return 0; + return blk_revalidate_disk_zones(disk); } void null_free_zoned_dev(struct nullb_device *dev) @@ -245,35 +255,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - static void null_close_imp_open_zone(struct nullb_device *dev) { struct nullb_zone *zone; @@ -290,7 +271,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev) zno = dev->zone_nr_conv; if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); + dev->nr_zones_imp_open--; + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } dev->imp_close_zone_no = zno; return; } @@ -378,76 +365,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, null_lock_zone(dev, zone); - if (zone->cond == BLK_ZONE_COND_FULL || - zone->cond == BLK_ZONE_COND_READONLY || - zone->cond == BLK_ZONE_COND_OFFLINE) { - /* Cannot write to the zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. + * Regular writes must be at the write pointer position. Zone append + * writes are automatically issued at the write pointer and the position + * returned using the request sector. Note that we do not check the zone + * condition because for FULL, READONLY and OFFLINE zones, the sector + * check against the zone write pointer will always result in failing + * the command. 
*/ if (append) { + if (WARN_ON_ONCE(!dev->zone_append_max_sectors) || + zone->wp == NULL_ZONE_INVALID_WP) { + ret = BLK_STS_IOERR; + goto unlock_zone; + } sector = zone->wp; - if (dev->queue_mode == NULL_Q_MQ) - cmd->rq->__sector = sector; - else - cmd->bio->bi_iter.bi_sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; + blk_mq_rq_from_pdu(cmd)->__sector = sector; } - if (zone->wp + nr_sectors > zone->start + zone->capacity) { + if (sector != zone->wp || + zone->wp + nr_sectors > zone->start + zone->capacity) { ret = BLK_STS_IOERR; - goto unlock; + goto unlock_zone; } if (zone->cond == BLK_ZONE_COND_CLOSED || zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + goto unlock_zone; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + spin_unlock(&dev->zone_res_lock); + } - null_unlock_zone_res(dev); + zone->cond = BLK_ZONE_COND_IMP_OPEN; } ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); if (ret != BLK_STS_OK) - goto unlock; + goto unlock_zone; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + spin_unlock(&dev->zone_res_lock); + } zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); } ret = BLK_STS_OK; -unlock: +unlock_zone: null_unlock_zone(dev, zone); return ret; @@ -461,54 +445,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; + /* Open operation on exp open is not an error */ + return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - ret = BLK_STS_IOERR; - goto unlock; + return BLK_STS_IOERR; } - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); -unlock: - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case 
BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + break; + } - return ret; + dev->nr_zones_exp_open++; + + spin_unlock(&dev->zone_res_lock); + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + + return BLK_STS_OK; } static blk_status_t null_close_zone(struct nullb_device *dev, struct nullb_zone *zone) { - blk_status_t ret; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } - return ret; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); + + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + default: + break; + } + + if (zone->wp > zone->start) + dev->nr_zones_closed++; + + spin_unlock(&dev->zone_res_lock); + } + + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + + return BLK_STS_OK; } static blk_status_t null_finish_zone(struct nullb_device *dev, @@ -519,41 +549,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Finish operation on full is not an error */ + spin_unlock(&dev->zone_res_lock); + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; -unlock: - null_unlock_zone_res(dev); - - return ret; + return BLK_STS_OK; } static blk_status_t null_reset_zone(struct nullb_device *dev, @@ -562,34 +598,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if 
(dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error */ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; - null_unlock_zone_res(dev); - if (dev->memory_backed) return null_handle_discard(dev, zone->start, zone->len); @@ -718,7 +753,7 @@ static void null_set_zone_cond(struct nullb_device *dev, zone->cond != BLK_ZONE_COND_OFFLINE) null_finish_zone(dev, zone); zone->cond = cond; - zone->wp = (sector_t)-1; + zone->wp = NULL_ZONE_INVALID_WP; } null_unlock_zone(dev, zone); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d56d972aadb3..65b96c083b3c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,7 +71,7 @@ #include <scsi/scsi_cmnd.h> #include <scsi/scsi_ioctl.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define DRIVER_NAME "pktcdvd" @@ -340,8 +340,8 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n", pd->disk->disk_name, MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), - MAJOR(pd->bdev_handle->bdev->bd_dev), - MINOR(pd->bdev_handle->bdev->bd_dev)); + MAJOR(file_bdev(pd->bdev_file)->bd_dev), + MINOR(file_bdev(pd->bdev_file)->bd_dev)); } mutex_unlock(&ctl_mutex); return n; @@ -438,7 +438,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) int states[PACKET_NUM_STATES]; seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, - pd->bdev_handle->bdev); + file_bdev(pd->bdev_file)); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) if (!pkt_debugfs_root) return; pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); - if (!pd->dfs_d_root) - return; pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, pd, &pkt_seq_fops); @@ -715,7 +713,7 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod */ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) { - struct request_queue *q = bdev_get_queue(pd->bdev_handle->bdev); + struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file)); struct scsi_cmnd *scmd; struct request *rq; int ret = 0; @@ -828,6 +826,12 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, */ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) { + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. 
+ */ + bio->bi_opf |= REQ_NOMERGE; + spin_lock(&pd->iosched.lock); if (bio_data_dir(bio) == READ) bio_list_add(&pd->iosched.read_queue, bio); @@ -1048,7 +1052,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) continue; bio = pkt->r_bios[f]; - bio_init(bio, pd->bdev_handle->bdev, bio->bi_inline_vecs, 1, + bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1, REQ_OP_READ); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); bio->bi_end_io = pkt_end_io_read; @@ -1264,7 +1268,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) struct device *ddev = disk_to_dev(pd->disk); int f; - bio_init(pkt->w_bio, pd->bdev_handle->bdev, pkt->w_bio->bi_inline_vecs, + bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs, pkt->frames, REQ_OP_WRITE); pkt->w_bio->bi_iter.bi_sector = pkt->sector; pkt->w_bio->bi_end_io = pkt_end_io_packet_write; @@ -2162,20 +2166,20 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) int ret; long lba; struct request_queue *q; - struct bdev_handle *bdev_handle; + struct file *bdev_file; /* * We need to re-open the cdrom device without O_NONBLOCK to be able * to read/write from/to it. It is already opened in O_NONBLOCK mode * so open should not fail. */ - bdev_handle = bdev_open_by_dev(pd->bdev_handle->bdev->bd_dev, + bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev, BLK_OPEN_READ, pd, NULL); - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); goto out; } - pd->open_bdev_handle = bdev_handle; + pd->f_open_bdev = bdev_file; ret = pkt_get_last_written(pd, &lba); if (ret) { @@ -2184,18 +2188,13 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) } set_capacity(pd->disk, lba << 2); - set_capacity_and_notify(pd->bdev_handle->bdev->bd_disk, lba << 2); + set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2); - q = bdev_get_queue(pd->bdev_handle->bdev); + q = bdev_get_queue(file_bdev(pd->bdev_file)); if (write) { ret = pkt_open_write(pd); if (ret) goto out_putdev; - /* - * Some CDRW drives can not handle writes larger than one packet, - * even if the size is a multiple of the packet size. 
- */ - blk_queue_max_hw_sectors(q, pd->settings.size); set_bit(PACKET_WRITABLE, &pd->flags); } else { pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); @@ -2214,11 +2213,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) } dev_info(ddev, "%lukB available on disc\n", lba << 1); } + set_blocksize(bdev_file, CD_FRAMESIZE); return 0; out_putdev: - bdev_release(bdev_handle); + fput(bdev_file); out: return ret; } @@ -2237,8 +2237,8 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - bdev_release(pd->open_bdev_handle); - pd->open_bdev_handle = NULL; + fput(pd->f_open_bdev); + pd->f_open_bdev = NULL; pkt_shrink_pktlist(pd); } @@ -2277,11 +2277,6 @@ static int pkt_open(struct gendisk *disk, blk_mode_t mode) ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE); if (ret) goto out_dec; - /* - * needed here as well, since ext2 (among others) may change - * the blocksize at mount time - */ - set_blocksize(disk->part0, CD_FRAMESIZE); } mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); @@ -2326,7 +2321,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_alloc_clone(pd->bdev_handle->bdev, bio, + struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio, GFP_NOIO, &pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); @@ -2338,9 +2333,9 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) pkt_queue_bio(pd, cloned_bio); } -static void pkt_make_request_write(struct request_queue *q, struct bio *bio) +static void pkt_make_request_write(struct bio *bio) { - struct pktcdvd_device *pd = q->queuedata; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; sector_t zone; struct packet_data *pkt; int was_empty, blocked_bio; @@ -2432,7 +2427,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; struct device *ddev = disk_to_dev(pd->disk); struct bio *split; @@ -2476,7 +2471,7 @@ static void pkt_submit_bio(struct bio *bio) split = bio; } - pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); + pkt_make_request_write(split); } while (split != bio); return; @@ -2484,20 +2479,11 @@ end_io: bio_io_error(bio); } -static void pkt_init_queue(struct pktcdvd_device *pd) -{ - struct request_queue *q = pd->disk->queue; - - blk_queue_logical_block_size(q, CD_FRAMESIZE); - blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); - q->queuedata = pd; -} - static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { struct device *ddev = disk_to_dev(pd->disk); int i; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct scsi_device *sdev; if (pd->pkt_dev == dev) { @@ -2508,9 +2494,9 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct pktcdvd_device *pd2 = pkt_devs[i]; if (!pd2) continue; - if (pd2->bdev_handle->bdev->bd_dev == dev) { + if (file_bdev(pd2->bdev_file)->bd_dev == dev) { dev_err(ddev, "%pg already setup\n", - pd2->bdev_handle->bdev); + file_bdev(pd2->bdev_file)); return -EBUSY; } if (pd2->pkt_dev == dev) { @@ -2519,13 +2505,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } } - bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, + bdev_file = 
bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, NULL, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); - sdev = scsi_device_from_queue(bdev_handle->bdev->bd_disk->queue); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); + sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue); if (!sdev) { - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } put_device(&sdev->sdev_gendev); @@ -2533,10 +2519,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - pd->bdev_handle = bdev_handle; - set_blocksize(bdev_handle->bdev, CD_FRAMESIZE); - - pkt_init_queue(pd); + pd->bdev_file = bdev_file; atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); @@ -2546,11 +2529,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); - dev_notice(ddev, "writer mapped to %pg\n", bdev_handle->bdev); + dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file)); return 0; out_mem: - bdev_release(bdev_handle); + fput(bdev_file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return -ENOMEM; @@ -2605,9 +2588,9 @@ static unsigned int pkt_check_events(struct gendisk *disk, if (!pd) return 0; - if (!pd->bdev_handle) + if (!pd->bdev_file) return 0; - attached_disk = pd->bdev_handle->bdev->bd_disk; + attached_disk = file_bdev(pd->bdev_file)->bd_disk; if (!attached_disk || !attached_disk->fops->check_events) return 0; return attached_disk->fops->check_events(attached_disk, clearing); @@ -2634,6 +2617,11 @@ static const struct block_device_operations pktcdvd_ops = { */ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) { + struct queue_limits lim = { + .max_hw_sectors = PACKET_MAX_SECTORS, + .logical_block_size = CD_FRAMESIZE, + .features = BLK_FEAT_ROTATIONAL, + }; int idx; int ret = -ENOMEM; struct pktcdvd_device *pd; @@ -2673,10 +2661,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_on = write_congestion_on; pd->write_congestion_off = write_congestion_off; - ret = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); goto out_mem; + } pd->disk = disk; disk->major = pktdev_major; disk->first_minor = idx; @@ -2692,7 +2681,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) goto out_mem2; /* inherit events of the host device */ - disk->events = pd->bdev_handle->bdev->bd_disk->events; + disk->events = file_bdev(pd->bdev_file)->bd_disk->events; ret = add_disk(disk); if (ret) @@ -2757,7 +2746,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_debugfs_dev_remove(pd); pkt_sysfs_dev_remove(pd); - bdev_release(pd->bdev_handle); + fput(pd->bdev_file); remove_proc_entry(pd->disk->disk_name, pkt_proc); dev_notice(ddev, "writer unmapped\n"); @@ -2784,7 +2773,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); if (pd) { - ctrl_cmd->dev = new_encode_dev(pd->bdev_handle->bdev->bd_dev); + ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev); ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); } else { ctrl_cmd->dev = 0; @@ -2846,7 +2835,6 @@ static const struct file_operations pkt_ctl_fops = { .compat_ioctl = pkt_ctl_compat_ioctl, #endif .owner = THIS_MODULE, - .llseek = no_llseek, }; static struct miscdevice 
pkt_misc = { diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 36d7b36c60c7..dc9e4a14b885 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -382,7 +382,15 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) struct ps3disk_private *priv; int error; unsigned int devidx; - struct request_queue *queue; + struct queue_limits lim = { + .logical_block_size = dev->blk_size, + .max_hw_sectors = BOUNCE_SIZE >> 9, + .max_segments = -1, + .max_segment_size = BOUNCE_SIZE, + .dma_alignment = dev->blk_size - 1, + .features = BLK_FEAT_WRITE_CACHE | + BLK_FEAT_ROTATIONAL, + }; struct gendisk *gendisk; if (dev->blk_size < 512) { @@ -426,12 +434,11 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) ps3disk_identify(dev); - error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE); + error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, 0); if (error) goto fail_teardown; - gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); + gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev); if (IS_ERR(gendisk)) { dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", __func__, __LINE__); @@ -439,17 +446,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) goto fail_free_tag_set; } - queue = gendisk->queue; - - blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); - blk_queue_dma_alignment(queue, dev->blk_size-1); - blk_queue_logical_block_size(queue, dev->blk_size); - - blk_queue_write_cache(queue, true, false); - - blk_queue_max_segments(queue, -1); - blk_queue_max_segment_size(queue, dev->bounce_size); - priv->gendisk = gendisk; gendisk->major = ps3disk_major; gendisk->first_minor = devidx * PS3DISK_MINORS; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 38d42af01b25..bdcf083b45e2 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - gendisk = blk_alloc_disk(NUMA_NO_NODE); - if (!gendisk) { + gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(gendisk)) { dev_err(&dev->core, "blk_alloc_disk failed\n"); - error = -ENOMEM; + error = PTR_ERR(gendisk); goto out_cache_cleanup; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 12b5d53ec856..faafd7ff43d6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -362,7 +362,7 @@ enum rbd_watch_state { enum rbd_lock_state { RBD_LOCK_STATE_UNLOCKED, RBD_LOCK_STATE_LOCKED, - RBD_LOCK_STATE_RELEASING, + RBD_LOCK_STATE_QUIESCING, }; /* WatchNotify::ClientId */ @@ -422,7 +422,7 @@ struct rbd_device { struct list_head running_list; struct completion acquire_wait; int acquire_err; - struct completion releasing_wait; + struct completion quiescing_wait; spinlock_t object_map_lock; u8 *object_map; @@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) lockdep_assert_held(&rbd_dev->lock_rwsem); return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || - rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; + rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING; } static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) @@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = { }; __ATTRIBUTE_GROUPS(rbd_bus); -static struct bus_type rbd_bus_type = { +static const struct bus_type rbd_bus_type = { .name = "rbd", .bus_groups = rbd_bus_groups, }; @@ -3457,13 +3457,14 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req) 
lockdep_assert_held(&rbd_dev->lock_rwsem); spin_lock(&rbd_dev->lock_lists_lock); if (!list_empty(&img_req->lock_item)) { + rbd_assert(!list_empty(&rbd_dev->running_list)); list_del_init(&img_req->lock_item); - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && list_empty(&rbd_dev->running_list)); } spin_unlock(&rbd_dev->lock_lists_lock); if (need_wakeup) - complete(&rbd_dev->releasing_wait); + complete(&rbd_dev->quiescing_wait); } static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) @@ -3476,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) if (rbd_lock_add_request(img_req)) return 1; - if (rbd_dev->opts->exclusive) { - WARN_ON(1); /* lock got released? */ - return -EROFS; - } - /* * Note the use of mod_delayed_work() in rbd_acquire_lock() * and cancel_delayed_work() in wake_lock_waiters(). @@ -4181,16 +4177,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) /* * Ensure that all in-flight IO is flushed. */ - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; - rbd_assert(!completion_done(&rbd_dev->releasing_wait)); + rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING; + rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); if (list_empty(&rbd_dev->running_list)) return true; up_write(&rbd_dev->lock_rwsem); - wait_for_completion(&rbd_dev->releasing_wait); + wait_for_completion(&rbd_dev->quiescing_wait); down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) + if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING) return false; rbd_assert(list_empty(&rbd_dev->running_list)); @@ -4601,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "failed to update lock cookie: %d", ret); + if (rbd_dev->opts->exclusive) + rbd_warn(rbd_dev, + "temporarily releasing lock on exclusive mapping"); + /* * Lock cookie cannot be updated on older OSDs, so do * a manual release and queue an acquire. 
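A recurring pattern across this series is that drivers stop calling blk_queue_* setters on an already-allocated queue and instead describe limits and features up front in a struct queue_limits handed to blk_mq_alloc_disk(), as the rbd_init_disk() conversion in the next hunk shows. The sketch below is illustrative only and not taken from any driver in this diff: the "sketchblk" name and the specific limit values are invented, and it assumes the post-conversion API where blk_mq_alloc_disk() takes a queue_limits pointer.

/*
 * Minimal sketch of the queue_limits-at-allocation pattern used throughout
 * this series.  "sketchblk" and the limit values are made up for
 * illustration; only the API shape mirrors the conversions in this diff.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>

static int sketchblk_alloc_disk(struct blk_mq_tag_set *set, void *drvdata,
				struct gendisk **diskp)
{
	struct queue_limits lim = {
		.logical_block_size	= 512,
		.physical_block_size	= 4096,
		.max_hw_sectors		= 256,	/* 128 KiB per request */
		.features		= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA,
	};
	struct gendisk *disk;

	/* Limits and features are applied atomically at allocation time. */
	disk = blk_mq_alloc_disk(set, &lim, drvdata);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	/*
	 * No blk_queue_max_hw_sectors()/blk_queue_write_cache() calls after
	 * the fact; the queue is fully described before it becomes visible.
	 */
	*diskp = disk;
	return 0;
}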
@@ -4949,16 +4949,21 @@ static const struct blk_mq_ops rbd_mq_ops = { static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; - struct request_queue *q; unsigned int objset_bytes = rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; + struct queue_limits lim = { + .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, + .io_opt = objset_bytes, + .io_min = rbd_dev->opts->alloc_size, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + }; int err; memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; rbd_dev->tag_set.nr_hw_queues = num_present_cpus(); rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request); @@ -4966,12 +4971,20 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (err) return err; - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + if (rbd_dev->opts->trim) { + lim.discard_granularity = rbd_dev->opts->alloc_size; + lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT; + lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; + } + + if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) + lim.features |= BLK_FEAT_STABLE_WRITES; + + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; } - q = disk->queue; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); @@ -4983,26 +4996,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->minors = RBD_MINORS_PER_MAJOR; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ - - blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); - q->limits.max_sectors = queue_max_hw_sectors(q); - blk_queue_max_segments(q, USHRT_MAX); - blk_queue_max_segment_size(q, UINT_MAX); - blk_queue_io_min(q, rbd_dev->opts->alloc_size); - blk_queue_io_opt(q, rbd_dev->opts->alloc_size); - - if (rbd_dev->opts->trim) { - q->limits.discard_granularity = rbd_dev->opts->alloc_size; - blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); - blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); - } - - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - rbd_dev->disk = disk; return 0; @@ -5382,7 +5375,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec) INIT_LIST_HEAD(&rbd_dev->acquiring_list); INIT_LIST_HEAD(&rbd_dev->running_list); init_completion(&rbd_dev->acquire_wait); - init_completion(&rbd_dev->releasing_wait); + init_completion(&rbd_dev->quiescing_wait); spin_lock_init(&rbd_dev->object_map_lock); @@ -6588,11 +6581,6 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) if (ret) return ret; - /* - * The lock may have been released by now, unless automatic lock - * transitions are disabled. - */ - rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); return 0; } @@ -7293,8 +7281,10 @@ static ssize_t do_rbd_remove(const char *buf, size_t count) * Prevent new IO from being queued and wait for existing * IO to complete/fail. 
*/ - blk_mq_freeze_queue(rbd_dev->disk->queue); + unsigned int memflags = blk_mq_freeze_queue(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); + blk_mq_unfreeze_queue(rbd_dev->disk->queue, memflags); } del_gendisk(rbd_dev->disk); diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 39887556cf95..6ea7c12e3a87 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -475,7 +475,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) } } -static struct kobj_type rnbd_dev_ktype = { +static const struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = rnbd_dev_groups, }; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4044c369d22a..82467ecde7ec 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1209,8 +1209,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess) tag_set->ops = &rnbd_mq_ops; tag_set->queue_depth = sess->queue_depth; tag_set->numa_node = NUMA_NO_NODE; - tag_set->flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_TAG_QUEUE_SHARED; + tag_set->flags = BLK_MQ_F_TAG_QUEUE_SHARED; tag_set->cmd_size = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE; /* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */ @@ -1329,43 +1328,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) } } -static void setup_request_queue(struct rnbd_clt_dev *dev, - struct rnbd_msg_open_rsp *rsp) -{ - blk_queue_logical_block_size(dev->queue, - le16_to_cpu(rsp->logical_block_size)); - blk_queue_physical_block_size(dev->queue, - le16_to_cpu(rsp->physical_block_size)); - blk_queue_max_hw_sectors(dev->queue, - dev->sess->max_io_size / SECTOR_SIZE); - - /* - * we don't support discards to "discontiguous" segments - * in on request - */ - blk_queue_max_discard_segments(dev->queue, 1); - - blk_queue_max_discard_sectors(dev->queue, - le32_to_cpu(rsp->max_discard_sectors)); - dev->queue->limits.discard_granularity = - le32_to_cpu(rsp->discard_granularity); - dev->queue->limits.discard_alignment = - le32_to_cpu(rsp->discard_alignment); - if (le16_to_cpu(rsp->secure_discard)) - blk_queue_max_secure_erase_sectors(dev->queue, - le32_to_cpu(rsp->max_discard_sectors)); - blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); - blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_max_segments(dev->queue, dev->sess->max_segments); - blk_queue_io_opt(dev->queue, dev->sess->max_io_size); - blk_queue_virt_boundary(dev->queue, SZ_4K - 1); - blk_queue_write_cache(dev->queue, - !!(rsp->cache_policy & RNBD_WRITEBACK), - !!(rsp->cache_policy & RNBD_FUA)); - blk_queue_max_write_zeroes_sectors(dev->queue, - le32_to_cpu(rsp->max_write_zeroes_sectors)); -} - static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, struct rnbd_msg_open_rsp *rsp, int idx) { @@ -1389,10 +1351,6 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, if (dev->access_mode == RNBD_ACCESS_RO) set_disk_ro(dev->gd, true); - /* - * Network device does not need rotational - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) put_disk(dev->gd); @@ -1403,18 +1361,41 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, struct rnbd_msg_open_rsp *rsp) { + struct queue_limits lim = { + .logical_block_size = le16_to_cpu(rsp->logical_block_size), + .physical_block_size = le16_to_cpu(rsp->physical_block_size), + .io_opt = dev->sess->max_io_size, + 
.max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE, + .max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors), + .discard_granularity = le32_to_cpu(rsp->discard_granularity), + .discard_alignment = le32_to_cpu(rsp->discard_alignment), + .max_segments = dev->sess->max_segments, + .virt_boundary_mask = SZ_4K - 1, + .max_write_zeroes_sectors = + le32_to_cpu(rsp->max_write_zeroes_sectors), + }; int idx = dev->clt_device_id; dev->size = le64_to_cpu(rsp->nsectors) * le16_to_cpu(rsp->logical_block_size); - dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); + if (rsp->secure_discard) { + lim.max_secure_erase_sectors = + le32_to_cpu(rsp->max_discard_sectors); + } + + if (rsp->cache_policy & RNBD_WRITEBACK) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (rsp->cache_policy & RNBD_FUA) + lim.features |= BLK_FEAT_FUA; + } + + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - setup_request_queue(dev, rsp); return rnbd_clt_setup_gen_disk(dev, rsp, idx); } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index cba6ba43c2c2..64780094442c 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -33,7 +33,7 @@ static void rnbd_srv_dev_release(struct kobject *kobj) kfree(dev); } -static struct kobj_type dev_ktype = { +static const struct kobj_type dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = rnbd_srv_dev_release }; @@ -184,7 +184,7 @@ static void rnbd_srv_sess_dev_release(struct kobject *kobj) rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id); } -static struct kobj_type rnbd_srv_sess_dev_ktype = { +static const struct kobj_type rnbd_srv_sess_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = rnbd_srv_sess_dev_release, }; diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h index 8dedf73bdd28..89d0bcb17195 100644 --- a/drivers/block/rnbd/rnbd-srv-trace.h +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(rnbd_srv_link_class, TP_fast_assign( __entry->qdepth = srv->queue_depth; - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("sessname: %s qdepth: %d", @@ -85,7 +85,7 @@ TRACE_EVENT(process_rdma, ), TP_fast_assign( - __assign_str(sessname, srv->sessname); + __assign_str(sessname); __entry->dir = id->dir; __entry->ver = srv->ver; __entry->device_id = le32_to_cpu(msg->device_id); @@ -130,7 +130,7 @@ TRACE_EVENT(process_msg_sess_info, __entry->proto_ver = srv->ver; __entry->clt_ver = msg->ver; __entry->srv_ver = RNBD_PROTO_VER_MAJOR; - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", @@ -165,8 +165,8 @@ TRACE_EVENT(process_msg_open, TP_fast_assign( __entry->access_mode = msg->access_mode; - __assign_str(sessname, srv->sessname); - __assign_str(dev_name, msg->dev_name); + __assign_str(sessname); + __assign_str(dev_name); ), TP_printk("Open message received: session='%s' path='%s' access_mode=%s", @@ -189,7 +189,7 @@ TRACE_EVENT(process_msg_close, TP_fast_assign( __entry->device_id = le32_to_cpu(msg->device_id); - __assign_str(sessname, srv->sessname); + __assign_str(sessname); ), TP_printk("Close message received: session='%s' device id='%d'", diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 3a0d5dcec6f2..2ee6e9bd4e28 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ 
b/drivers/block/rnbd/rnbd-srv.c @@ -145,22 +145,29 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(sess_dev->bdev_handle->bdev, 1, + bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { - rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); + rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n"); err = -EINVAL; goto bio_put; } + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio_has_data(bio) && + bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); - bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); - bio_set_prio(bio, prio); + bio->bi_ioprio = prio; submit_bio(bio); @@ -219,7 +226,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - bdev_release(sess_dev->bdev_handle); + fput(sess_dev->bdev_file); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (!sess_dev->readonly) @@ -534,7 +541,7 @@ rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { - struct block_device *bdev = sess_dev->bdev_handle->bdev; + struct block_device *bdev = file_bdev(sess_dev->bdev_file); rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = cpu_to_le32(sess_dev->device_id); @@ -560,7 +567,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct bdev_handle *handle, bool readonly, + struct file *bdev_file, bool readonly, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -572,7 +579,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); - sdev->bdev_handle = handle; + sdev->bdev_file = bdev_file; sdev->sess = srv_sess; sdev->dev = srv_dev; sdev->readonly = readonly; @@ -678,7 +685,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_dev *srv_dev; struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; - struct bdev_handle *bdev_handle; + struct file *bdev_file; blk_mode_t open_flags = BLK_OPEN_READ; char *full_path; struct rnbd_msg_open_rsp *rsp = data; @@ -716,15 +723,15 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - bdev_handle = bdev_open_by_path(full_path, open_flags, NULL, NULL); - if (IS_ERR(bdev_handle)) { - ret = PTR_ERR(bdev_handle); + bdev_file = bdev_file_open_by_path(full_path, open_flags, NULL, NULL); + if (IS_ERR(bdev_file)) { + ret = PTR_ERR(bdev_file); pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %pe\n", - full_path, srv_sess->sessname, bdev_handle); + full_path, srv_sess->sessname, bdev_file); goto free_path; } - srv_dev = 
rnbd_srv_get_or_create_srv_dev(bdev_handle->bdev, srv_sess, + srv_dev = rnbd_srv_get_or_create_srv_dev(file_bdev(bdev_file), srv_sess, open_msg->access_mode); if (IS_ERR(srv_dev)) { pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %pe\n", @@ -734,7 +741,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, } srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - bdev_handle, + bdev_file, open_msg->access_mode == RNBD_ACCESS_RO, srv_dev); if (IS_ERR(srv_sess_dev)) { @@ -750,7 +757,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev_handle->bdev); + ret = rnbd_srv_create_dev_sysfs(srv_dev, file_bdev(bdev_file)); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, @@ -793,7 +800,7 @@ srv_dev_put: } rnbd_put_srv_dev(srv_dev); blkdev_put: - bdev_release(bdev_handle); + fput(bdev_file); free_path: kfree(full_path); reject: diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 343cc682b617..18d873808b8d 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -46,7 +46,7 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; struct kobject kobj; diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs new file mode 100644 index 000000000000..ddf3629d8894 --- /dev/null +++ b/drivers/block/rnull.rs @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This is a Rust implementation of the C null block driver. +//! +//! Supported features: +//! +//! - blk-mq interface +//! - direct completion +//! - block size 4k +//! +//! The driver is not configurable. + +use kernel::{ + alloc::flags, + block::mq::{ + self, + gen_disk::{self, GenDisk}, + Operations, TagSet, + }, + error::Result, + new_mutex, pr_info, + prelude::*, + sync::{Arc, Mutex}, + types::ARef, +}; + +module! { + type: NullBlkModule, + name: "rnull_mod", + author: "Andreas Hindborg", + description: "Rust implementation of the C null block driver", + license: "GPL v2", +} + +#[pin_data] +struct NullBlkModule { + #[pin] + _disk: Mutex<GenDisk<NullBlkDevice>>, +} + +impl kernel::InPlaceModule for NullBlkModule { + fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> { + pr_info!("Rust null_blk loaded\n"); + + // Use a immediately-called closure as a stable `try` block + let disk = /* try */ (|| { + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; + + gen_disk::GenDiskBuilder::new() + .capacity_sectors(4096 << 11) + .logical_block_size(4096)? + .physical_block_size(4096)? + .rotational(false) + .build(format_args!("rnullb{}", 0), tagset) + })(); + + try_pin_init!(Self { + _disk <- new_mutex!(disk?, "nullb:disk"), + }) + } +} + +struct NullBlkDevice; + +#[vtable] +impl Operations for NullBlkDevice { + #[inline(always)] + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result { + mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. 
+ .expect("Fatal error - expected to be able to end request"); + + Ok(()) + } + + fn commit_rqs() {} +} diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7bf4b48e2282..282f81616a78 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -28,7 +28,7 @@ static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; -MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_AUTHOR("David S. Miller <davem@davemloft.net>"); MODULE_DESCRIPTION("Sun LDOM virtual disk client driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); @@ -784,6 +784,15 @@ static const struct blk_mq_ops vdc_mq_ops = { static int probe_disk(struct vdc_port *port) { + struct queue_limits lim = { + .physical_block_size = port->vdisk_phys_blksz, + .max_hw_sectors = port->max_xfer_size, + /* Each segment in a request is up to an aligned page in size. */ + .seg_boundary_mask = PAGE_SIZE - 1, + .max_segment_size = PAGE_SIZE, + .max_segments = port->ring_cookies, + .features = BLK_FEAT_ROTATIONAL, + }; struct request_queue *q; struct gendisk *g; int err; @@ -820,11 +829,11 @@ static int probe_disk(struct vdc_port *port) } err = blk_mq_alloc_sq_tag_set(&port->tag_set, &vdc_mq_ops, - VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE); + VDC_TX_RING_SIZE, 0); if (err) return err; - g = blk_mq_alloc_disk(&port->tag_set, port); + g = blk_mq_alloc_disk(&port->tag_set, &lim, port); if (IS_ERR(g)) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); @@ -835,12 +844,6 @@ static int probe_disk(struct vdc_port *port) port->disk = g; q = g->queue; - /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_max_segment_size(q, PAGE_SIZE); - - blk_queue_max_segments(q, port->ring_cookies); - blk_queue_max_hw_sectors(q, port->max_xfer_size); g->major = vdc_major; g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; g->minors = 1 << PARTITION_SHIFT; @@ -872,8 +875,6 @@ static int probe_disk(struct vdc_port *port) } } - blk_queue_physical_block_size(q, port->vdisk_phys_blksz); - pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", g->disk_name, port->vdisk_size, (port->vdisk_size >> (20 - 9)), @@ -917,12 +918,12 @@ struct vdc_check_port_data { char *type; }; -static int vdc_device_probed(struct device *dev, void *arg) +static int vdc_device_probed(struct device *dev, const void *arg) { struct vio_dev *vdev = to_vio_dev(dev); - struct vdc_check_port_data *port_data; + const struct vdc_check_port_data *port_data; - port_data = (struct vdc_check_port_data *)arg; + port_data = (const struct vdc_check_port_data *)arg; if ((vdev->dev_no == port_data->dev_no) && (!(strcmp((char *)&vdev->type, port_data->type))) && @@ -1112,6 +1113,7 @@ static void vdc_requeue_inflight(struct vdc_port *port) static void vdc_queue_drain(struct vdc_port *port) { struct request_queue *q = port->disk->queue; + unsigned int memflags; /* * Mark the queue as draining, then freeze/quiesce to ensure @@ -1120,13 +1122,13 @@ static void vdc_queue_drain(struct vdc_port *port) port->drain = 1; spin_unlock_irq(&port->vio.lock); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&port->vio.lock); port->drain = 0; blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } static void vdc_ldc_reset_timer_work(struct work_struct *work) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index f85b6af414b4..eda33c5eb5e2 100644 
--- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -787,6 +787,9 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs) static int swim_floppy_init(struct swim_priv *swd) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; int err; int drive; struct swim __iomem *base = swd->base; @@ -815,12 +818,12 @@ static int swim_floppy_init(struct swim_priv *swd) for (drive = 0; drive < swd->floppy_count; drive++) { err = blk_mq_alloc_sq_tag_set(&swd->unit[drive].tag_set, - &swim_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE); + &swim_mq_ops, 2, 0); if (err) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, &lim, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); @@ -916,7 +919,7 @@ out: return ret; } -static int swim_remove(struct platform_device *dev) +static void swim_remove(struct platform_device *dev) { struct swim_priv *swd = platform_get_drvdata(dev); int drive; @@ -937,8 +940,6 @@ static int swim_remove(struct platform_device *dev) release_mem_region(res->start, resource_size(res)); kfree(swd); - - return 0; } static struct platform_driver swim_driver = { diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c2bc85826358..3aedcb5add61 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -840,6 +840,7 @@ static int grab_drive(struct floppy_state *fs, enum swim_state state, static void release_drive(struct floppy_state *fs) { struct request_queue *q = disks[fs->index]->queue; + unsigned int memflags; unsigned long flags; swim3_dbg("%s", "-> release drive\n"); @@ -848,10 +849,10 @@ static void release_drive(struct floppy_state *fs) fs->state = idle; spin_unlock_irqrestore(&swim3_lock, flags); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } static int fd_eject(struct floppy_state *fs) @@ -1189,6 +1190,9 @@ static int swim3_add_device(struct macio_dev *mdev, int index) static int swim3_attach(struct macio_dev *mdev, const struct of_device_id *match) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct floppy_state *fs; struct gendisk *disk; int rc; @@ -1205,12 +1209,11 @@ static int swim3_attach(struct macio_dev *mdev, fs = &floppy_states[floppy_count]; memset(fs, 0, sizeof(*fs)); - rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); + rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, 0); if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, &lim, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1dfb2e77898b..ca9a67b5b537 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,6 +48,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* private ioctl command mirror */ +#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -57,7 +60,12 @@ | UBLK_F_UNPRIVILEGED_DEV \ | UBLK_F_CMD_IOCTL_ENCODE \ | UBLK_F_USER_COPY \ - | UBLK_F_ZONED) + | UBLK_F_ZONED \ + | UBLK_F_USER_RECOVERY_FAIL_IO) + +#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE \ + | 
UBLK_F_USER_RECOVERY_FAIL_IO) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL \ @@ -68,9 +76,6 @@ struct ublk_rq_data { struct llist_node node; struct kref ref; - __u64 sector; - __u32 operation; - __u32 nr_zones; }; struct ublk_uring_cmd_pdu { @@ -143,6 +148,7 @@ struct ublk_queue { bool force_abort; bool timeout; bool canceling; + bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ unsigned short nr_io_ready; /* how many ios setup */ spinlock_t cancel_lock; struct ublk_device *dev; @@ -179,8 +185,7 @@ struct ublk_device { unsigned int nr_queues_ready; unsigned int nr_privileged_daemon; - struct work_struct quiesce_work; - struct work_struct stop_work; + struct work_struct nosrv_work; }; /* header of ublk_params */ @@ -211,6 +216,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) #ifdef CONFIG_BLK_DEV_ZONED +struct ublk_zoned_report_desc { + __u64 sector; + __u32 operation; + __u32 nr_zones; +}; + +static DEFINE_XARRAY(ublk_zoned_report_descs); + +static int ublk_zoned_insert_report_desc(const struct request *req, + struct ublk_zoned_report_desc *desc) +{ + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, + desc, GFP_KERNEL); +} + +static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( + const struct request *req) +{ + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); +} + +static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( + const struct request *req) +{ + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); +} + static int ublk_get_nr_zones(const struct ublk_device *ub) { const struct ublk_param_basic *p = &ub->params.basic; @@ -221,7 +253,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub) { - return blk_revalidate_disk_zones(ub->ub_disk, NULL); + return blk_revalidate_disk_zones(ub->ub_disk); } static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) @@ -246,21 +278,9 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) return 0; } -static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - const struct ublk_param_zoned *p = &ub->params.zoned; - - disk_set_zoned(ub->ub_disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - blk_queue_required_elevator_features(ub->ub_disk->queue, - ELEVATOR_F_ZBD_SEQ_WRITE); - disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); - disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); - blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); - ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); - - return 0; } /* Based on virtblk_alloc_report_buffer */ @@ -317,7 +337,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, unsigned int zones_in_request = min_t(unsigned int, remaining_zones, max_zones_per_request); struct request *req; - struct ublk_rq_data *pdu; + struct ublk_zoned_report_desc desc; blk_status_t status; memset(buffer, 0, buffer_length); @@ -328,20 +348,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, goto out; } - pdu = blk_mq_rq_to_pdu(req); - pdu->operation = UBLK_IO_OP_REPORT_ZONES; - pdu->sector = sector; - pdu->nr_zones = zones_in_request; + desc.operation = UBLK_IO_OP_REPORT_ZONES; + desc.sector = sector; + desc.nr_zones = zones_in_request; + ret = ublk_zoned_insert_report_desc(req, &desc); + if (ret) + goto free_req; ret = 
blk_rq_map_kern(disk->queue, req, buffer, buffer_length, GFP_KERNEL); - if (ret) { - blk_mq_free_request(req); - goto out; - } + if (ret) + goto erase_desc; status = blk_execute_rq(req, 0); ret = blk_status_to_errno(status); +erase_desc: + ublk_zoned_erase_report_desc(req); +free_req: blk_mq_free_request(req); if (ret) goto out; @@ -375,7 +398,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); + struct ublk_zoned_report_desc *desc; u32 ublk_op; switch (req_op(req)) { @@ -398,12 +421,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - ublk_op = pdu->operation; + desc = ublk_zoned_get_report_desc(req); + if (!desc) + return BLK_STS_IOERR; + ublk_op = desc->operation; switch (ublk_op) { case UBLK_IO_OP_REPORT_ZONES: iod->op_flags = ublk_op | ublk_req_build_flags(req); - iod->nr_zones = pdu->nr_zones; - iod->start_sector = pdu->sector; + iod->nr_zones = desc->nr_zones; + iod->start_sector = desc->sector; return BLK_STS_OK; default: return BLK_STS_IOERR; @@ -432,9 +458,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) return -EOPNOTSUPP; } -static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - return -EOPNOTSUPP; } static int ublk_revalidate_disk_zones(struct ublk_device *ub) @@ -495,44 +520,14 @@ static inline unsigned ublk_pos_to_tag(loff_t pos) static void ublk_dev_param_basic_apply(struct ublk_device *ub) { - struct request_queue *q = ub->ub_disk->queue; const struct ublk_param_basic *p = &ub->params.basic; - blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); - blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); - blk_queue_io_min(q, 1 << p->io_min_shift); - blk_queue_io_opt(q, 1 << p->io_opt_shift); - - blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, - p->attrs & UBLK_ATTR_FUA); - if (p->attrs & UBLK_ATTR_ROTATIONAL) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - - blk_queue_max_hw_sectors(q, p->max_sectors); - blk_queue_chunk_sectors(q, p->chunk_sectors); - blk_queue_virt_boundary(q, p->virt_boundary_mask); - if (p->attrs & UBLK_ATTR_READ_ONLY) set_disk_ro(ub->ub_disk, true); set_capacity(ub->ub_disk, p->dev_sectors); } -static void ublk_dev_param_discard_apply(struct ublk_device *ub) -{ - struct request_queue *q = ub->ub_disk->queue; - const struct ublk_param_discard *p = &ub->params.discard; - - q->limits.discard_alignment = p->discard_alignment; - q->limits.discard_granularity = p->discard_granularity; - blk_queue_max_discard_sectors(q, p->max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, - p->max_write_zeroes_sectors); - blk_queue_max_discard_segments(q, p->max_discard_segments); -} - static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -576,20 +571,12 @@ static int ublk_validate_params(const struct ublk_device *ub) return 0; } -static int ublk_apply_params(struct ublk_device *ub) +static void ublk_apply_params(struct ublk_device *ub) { - if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) - return -EINVAL; - ublk_dev_param_basic_apply(ub); - if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) - ublk_dev_param_discard_apply(ub); - if (ub->params.types & UBLK_PARAM_TYPE_ZONED) - 
return ublk_dev_param_zoned_apply(ub); - - return 0; + ublk_dev_param_zoned_apply(ub); } static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) @@ -645,14 +632,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_NEED_GET_DATA; } -static struct ublk_device *ublk_get_device(struct ublk_device *ub) +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) { if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) return ub; return NULL; } -static void ublk_put_device(struct ublk_device *ub) +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline void ublk_put_device(struct ublk_device *ub) { put_device(&ub->cdev_dev); } @@ -680,30 +669,69 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) return ublk_get_queue(ub, q_id)->io_cmd_buf; } +static inline int __ublk_queue_cmd_buf_size(int depth) +{ + return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); +} + static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), - PAGE_SIZE); + return __ublk_queue_cmd_buf_size(ubq->q_depth); +} + +static int ublk_max_cmd_buf_size(void) +{ + return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); +} + +/* + * Should I/O outstanding to the ublk server when it exits be reissued? + * If not, outstanding I/O will get errors. + */ +static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) +{ + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); +} + +/* + * Should I/O issued while there is no ublk server queue? If not, I/O + * issued while there is no ublk server will get errors. + */ +static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) +{ + return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && + !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); } -static inline bool ublk_queue_can_use_recovery_reissue( - struct ublk_queue *ubq) +/* + * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy + * of the device flags for smaller cache footprint - better for fast + * paths. + */ +static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) { return (ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); + !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); } -static inline bool ublk_queue_can_use_recovery( - struct ublk_queue *ubq) +/* + * Should ublk devices be stopped (i.e. no recovery possible) when the + * ublk server exits? If not, devices can be used again by a future + * incarnation of a ublk server via the start_recovery/end_recovery + * commands. 
+ */ +static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) { - return ubq->flags & UBLK_F_USER_RECOVERY; + return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); } -static inline bool ublk_can_use_recovery(struct ublk_device *ub) +static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_RECOVERY; + return ub->dev_info.state == UBLK_S_DEV_QUIESCED || + ub->dev_info.state == UBLK_S_DEV_FAIL_IO; } static void ublk_free_disk(struct gendisk *disk) @@ -711,7 +739,7 @@ static void ublk_free_disk(struct gendisk *disk) struct ublk_device *ub = disk->private_data; clear_bit(UB_STATE_USED, &ub->state); - put_device(&ub->cdev_dev); + ublk_put_device(ub); } static void ublk_store_owner_uid_gid(unsigned int *owner_uid, @@ -1079,7 +1107,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_queue_can_use_recovery_reissue(ubq)) + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) blk_mq_requeue_request(req, false); else ublk_put_req_ref(ubq, req); @@ -1107,7 +1135,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, struct request *rq) { /* We cannot process this rq so just requeue it. */ - if (ublk_queue_can_use_recovery(ubq)) + if (ublk_nosrv_dev_should_queue_io(ubq->dev)) blk_mq_requeue_request(rq, false); else blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1252,10 +1280,7 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) struct ublk_device *ub = ubq->dev; if (ublk_abort_requests(ub, ubq)) { - if (ublk_can_use_recovery(ub)) - schedule_work(&ub->quiesce_work); - else - schedule_work(&ub->stop_work); + schedule_work(&ub->nosrv_work); } return BLK_EH_DONE; } @@ -1270,6 +1295,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; blk_status_t res; + if (unlikely(ubq->fail_io)) { + return BLK_STS_TARGET; + } + /* fill iod to slot in io cmd buffer */ res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) @@ -1284,7 +1313,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. 
*/ - if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; if (unlikely(ubq->canceling)) { @@ -1338,7 +1367,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) { struct ublk_device *ub = filp->private_data; size_t sz = vma->vm_end - vma->vm_start; - unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned max_sz = ublk_max_cmd_buf_size(); unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; int q_id, ret = 0; @@ -1505,10 +1534,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, io, issue_flags); if (need_schedule) { - if (ublk_can_use_recovery(ub)) - schedule_work(&ub->quiesce_work); - else - schedule_work(&ub->stop_work); + schedule_work(&ub->nosrv_work); } } @@ -1571,20 +1597,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) ub->dev_info.state = UBLK_S_DEV_QUIESCED; } -static void ublk_quiesce_work_fn(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, quiesce_work); - - mutex_lock(&ub->mutex); - if (ub->dev_info.state != UBLK_S_DEV_LIVE) - goto unlock; - __ublk_quiesce_dev(ub); - unlock: - mutex_unlock(&ub->mutex); - ublk_cancel_dev(ub); -} - static void ublk_unquiesce_dev(struct ublk_device *ub) { int i; @@ -1606,6 +1618,21 @@ static void ublk_unquiesce_dev(struct ublk_device *ub) blk_mq_kick_requeue_list(ub->ub_disk->queue); } +static struct gendisk *ublk_detach_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ub->ub_disk = NULL; + spin_unlock(&ub->lock); + + return disk; +} + static void ublk_stop_dev(struct ublk_device *ub) { struct gendisk *disk; @@ -1613,26 +1640,50 @@ static void ublk_stop_dev(struct ublk_device *ub) mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_DEAD) goto unlock; - if (ublk_can_use_recovery(ub)) { + if (ublk_nosrv_dev_should_queue_io(ub)) { if (ub->dev_info.state == UBLK_S_DEV_LIVE) __ublk_quiesce_dev(ub); ublk_unquiesce_dev(ub); } del_gendisk(ub->ub_disk); - - /* Sync with ublk_abort_queue() by holding the lock */ - spin_lock(&ub->lock); - disk = ub->ub_disk; - ub->dev_info.state = UBLK_S_DEV_DEAD; - ub->dev_info.ublksrv_pid = -1; - ub->ub_disk = NULL; - spin_unlock(&ub->lock); + disk = ublk_detach_disk(ub); put_disk(disk); unlock: mutex_unlock(&ub->mutex); ublk_cancel_dev(ub); } +static void ublk_nosrv_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, nosrv_work); + int i; + + if (ublk_nosrv_should_stop_dev(ub)) { + ublk_stop_dev(ub); + return; + } + + mutex_lock(&ub->mutex); + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto unlock; + + if (ublk_nosrv_dev_should_queue_io(ub)) { + __ublk_quiesce_dev(ub); + } else { + blk_mq_quiesce_queue(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ublk_get_queue(ub, i)->fail_io = true; + } + blk_mq_unquiesce_queue(ub->ub_disk->queue); + } + + unlock: + mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); +} + /* device can only be started after all IOs are ready */ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) { @@ -1999,7 +2050,6 @@ static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = 
ublk_ch_open, .release = ublk_ch_release, - .llseek = no_llseek, .read_iter = ublk_ch_read_iter, .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, @@ -2147,14 +2197,6 @@ static int ublk_add_chdev(struct ublk_device *ub) return ret; } -static void ublk_stop_work_fn(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, stop_work); - - ublk_stop_dev(ub); -} - /* align max io buffer size with PAGE_SIZE */ static void ublk_align_max_io_size(struct ublk_device *ub) { @@ -2171,7 +2213,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); - ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } @@ -2179,10 +2220,9 @@ static int ublk_add_tag_set(struct ublk_device *ub) static void ublk_remove(struct ublk_device *ub) { ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); + cancel_work_sync(&ub->nosrv_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); - put_device(&ub->cdev_dev); + ublk_put_device(ub); ublks_added--; } @@ -2205,12 +2245,58 @@ static struct ublk_device *ublk_get_device_from_id(int idx) static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + const struct ublk_param_basic *p = &ub->params.basic; int ublksrv_pid = (int)header->data[0]; + struct queue_limits lim = { + .logical_block_size = 1 << p->logical_bs_shift, + .physical_block_size = 1 << p->physical_bs_shift, + .io_min = 1 << p->io_min_shift, + .io_opt = 1 << p->io_opt_shift, + .max_hw_sectors = p->max_sectors, + .chunk_sectors = p->chunk_sectors, + .virt_boundary_mask = p->virt_boundary_mask, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + .dma_alignment = 3, + }; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { + const struct ublk_param_discard *pd = &ub->params.discard; + + lim.discard_alignment = pd->discard_alignment; + lim.discard_granularity = pd->discard_granularity; + lim.max_hw_discard_sectors = pd->max_discard_sectors; + lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; + lim.max_discard_segments = pd->max_discard_segments; + } + + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { + const struct ublk_param_zoned *p = &ub->params.zoned; + + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return -EOPNOTSUPP; + + lim.features |= BLK_FEAT_ZONED; + lim.max_active_zones = p->max_active_zones; + lim.max_open_zones = p->max_open_zones; + lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; + } + + if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (ub->params.basic.attrs & UBLK_ATTR_FUA) + lim.features |= BLK_FEAT_FUA; + } + + if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -2222,7 +2308,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, NULL); + disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; @@ -2234,15 +2320,13 @@ static int 
ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) ub->dev_info.ublksrv_pid = ublksrv_pid; ub->ub_disk = disk; - ret = ublk_apply_params(ub); - if (ret) - goto out_put_disk; + ublk_apply_params(ub); /* don't probe partitions if any one ubq daemon is un-trusted */ if (ub->nr_privileged_daemon != ub->nr_queues_ready) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); - get_device(&ub->cdev_dev); + ublk_get_device(ub); ub->dev_info.state = UBLK_S_DEV_LIVE; if (ublk_dev_is_zoned(ub)) { @@ -2259,10 +2343,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) out_put_cdev: if (ret) { - ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_detach_disk(ub); ublk_put_device(ub); } -out_put_disk: if (ret) put_disk(disk); out_unlock: @@ -2346,6 +2429,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) return -EPERM; + /* forbid nonsense combinations of recovery flags */ + switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) { + case 0: + case UBLK_F_USER_RECOVERY: + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): + case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): + break; + default: + pr_warn("%s: invalid recovery flags %llx\n", __func__, + info.flags & UBLK_F_ALL_RECOVERY_FLAGS); + return -EINVAL; + } + /* * unprivileged device can't be trusted, but RECOVERY and * RECOVERY_REISSUE still may hang error handling, so can't @@ -2354,10 +2450,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * TODO: provide forward progress for RECOVERY handler, so that * unprivileged device can benefit from it */ - if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | UBLK_F_USER_RECOVERY); + /* + * For USER_COPY, we depends on userspace to fill request + * buffer by pwrite() to ublk char device, which can't be + * used for unprivileged device + */ + if (info.flags & UBLK_F_USER_COPY) + return -EINVAL; + } + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); @@ -2389,8 +2494,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); - INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); - INIT_WORK(&ub->stop_work, ublk_stop_work_fn); + INIT_WORK(&ub->nosrv_work, ublk_nosrv_work); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2474,7 +2578,7 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(struct ublk_device **p_ub) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) { struct ublk_device *ub = *p_ub; int idx = ub->ub_number; @@ -2508,7 +2612,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) * - the device number is freed already, we will not find this * device via ublk_get_device_from_id() */ - if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) return -EINTR; return 0; } @@ -2525,9 +2629,7 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) static int ublk_ctrl_stop_dev(struct ublk_device *ub) { ublk_stop_dev(ub); - cancel_work_sync(&ub->stop_work); - cancel_work_sync(&ub->quiesce_work); - + cancel_work_sync(&ub->nosrv_work); return 0; } @@ -2613,9 +2715,12 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - /* 
parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; @@ -2664,7 +2769,9 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, int i; mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) + goto out_unlock; + if (!ub->nr_queues_ready) goto out_unlock; /* * START_RECOVERY is only allowd after: @@ -2673,14 +2780,18 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, * and related io_uring ctx is freed so file struct of /dev/ublkcX is * released. * + * and one of the following holds + * * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: * (a)has quiesced request queue * (b)has requeued every inflight rqs whose io_flags is ACTIVE * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE * (d)has completed/camceled all ioucmds owned by ther dying process + * + * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not + * quiesced, but all I/O is being immediately errored */ - if (test_bit(UB_STATE_OPEN, &ub->state) || - ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } @@ -2704,6 +2815,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; + int i; pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); @@ -2715,21 +2827,32 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, __func__, ub->dev_info.nr_hw_queues, header->dev_id); mutex_lock(&ub->mutex); - if (!ublk_can_use_recovery(ub)) + if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; - if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + if (!ublk_dev_in_recoverable_state(ub)) { ret = -EBUSY; goto out_unlock; } ub->dev_info.ublksrv_pid = ublksrv_pid; pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); - blk_mq_unquiesce_queue(ub->ub_disk->queue); - pr_devel("%s: queue unquiesced, dev id %d.\n", - __func__, header->dev_id); - blk_mq_kick_requeue_list(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_LIVE; + + if (ublk_nosrv_dev_should_queue_io(ub)) { + ub->dev_info.state = UBLK_S_DEV_LIVE; + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + } else { + blk_mq_quiesce_queue(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ublk_get_queue(ub, i)->fail_io = false; + } + blk_mq_unquiesce_queue(ub->ub_disk->queue); + } + ret = 0; out_unlock: mutex_unlock(&ub->mutex); @@ -2907,7 +3030,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(&ub); + ret = ublk_ctrl_del_dev(&ub, true); + break; + case UBLK_CMD_DEL_DEV_ASYNC: + ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: ret = ublk_ctrl_get_queue_affinity(ub, cmd); @@ -2925,7 +3051,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = 
ublk_ctrl_end_recovery(ub, cmd); break; default: - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; break; } @@ -3019,4 +3145,5 @@ module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); +MODULE_DESCRIPTION("Userspace block device"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2bf14a0e2815..91cde76a4b3e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -13,7 +13,6 @@ #include <linux/string_helpers.h> #include <linux/idr.h> #include <linux/blk-mq.h> -#include <linux/blk-mq-virtio.h> #include <linux/numa.h> #include <linux/vmalloc.h> #include <uapi/linux/virtio_ring.h> @@ -471,18 +470,18 @@ static bool virtblk_prep_rq_batch(struct request *req) return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; } -static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, - struct request **rqlist) +static void virtblk_add_req_batch(struct virtio_blk_vq *vq, + struct rq_list *rqlist) { + struct request *req; unsigned long flags; - int err; bool kick; spin_lock_irqsave(&vq->lock, flags); - while (!rq_list_empty(*rqlist)) { - struct request *req = rq_list_pop(rqlist); + while ((req = rq_list_pop(rqlist))) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + int err; err = virtblk_add_req(vq->vq, vbr); if (err) { @@ -495,37 +494,32 @@ static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, kick = virtqueue_kick_prepare(vq->vq); spin_unlock_irqrestore(&vq->lock, flags); - return kick; + if (kick) + virtqueue_notify(vq->vq); } -static void virtio_queue_rqs(struct request **rqlist) +static void virtio_queue_rqs(struct rq_list *rqlist) { - struct request *req, *next, *prev = NULL; - struct request *requeue_list = NULL; - - rq_list_for_each_safe(rqlist, req, next) { - struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx); - bool kick; - - if (!virtblk_prep_rq_batch(req)) { - rq_list_move(rqlist, &requeue_list, req, prev); - req = prev; - if (!req) - continue; - } + struct rq_list submit_list = { }; + struct rq_list requeue_list = { }; + struct virtio_blk_vq *vq = NULL; + struct request *req; - if (!next || req->mq_hctx != next->mq_hctx) { - req->rq_next = NULL; - kick = virtblk_add_req_batch(vq, rqlist); - if (kick) - virtqueue_notify(vq->vq); + while ((req = rq_list_pop(rqlist))) { + struct virtio_blk_vq *this_vq = get_virtio_blk_vq(req->mq_hctx); - *rqlist = next; - prev = NULL; - } else - prev = req; + if (vq && vq != this_vq) + virtblk_add_req_batch(vq, &submit_list); + vq = this_vq; + + if (virtblk_prep_rq_batch(req)) + rq_list_add_tail(&submit_list, req); + else + rq_list_add_tail(&requeue_list, req); } + if (vq) + virtblk_add_req_batch(vq, &submit_list); *rqlist = requeue_list; } @@ -720,25 +714,24 @@ fail_report: return ret; } -static int virtblk_probe_zoned_device(struct virtio_device *vdev, - struct virtio_blk *vblk, - struct request_queue *q) +static int virtblk_read_zoned_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { + struct virtio_device *vdev = vblk->vdev; u32 v, wg; dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - disk_set_zoned(vblk->disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + lim->features |= BLK_FEAT_ZONED; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); - disk_set_max_open_zones(vblk->disk, v); + lim->max_open_zones = v; dev_dbg(&vdev->dev, "max open zones = %u\n", v); virtio_cread(vdev, 
struct virtio_blk_config, zoned.max_active_zones, &v); - disk_set_max_active_zones(vblk->disk, v); + lim->max_active_zones = v; dev_dbg(&vdev->dev, "max active zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, @@ -747,8 +740,8 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, dev_warn(&vdev->dev, "zero write granularity reported\n"); return -ENODEV; } - blk_queue_physical_block_size(q, wg); - blk_queue_io_min(q, wg); + lim->physical_block_size = wg; + lim->io_min = wg; dev_dbg(&vdev->dev, "write granularity = %u\n", wg); @@ -764,13 +757,13 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, vblk->zone_sectors); return -ENODEV; } - blk_queue_chunk_sectors(q, vblk->zone_sectors); + lim->chunk_sectors = vblk->zone_sectors; dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { dev_warn(&vblk->vdev->dev, "ignoring negotiated F_DISCARD for zoned device\n"); - blk_queue_max_discard_sectors(q, 0); + lim->max_hw_discard_sectors = 0; } virtio_cread(vdev, struct virtio_blk_config, @@ -785,25 +778,21 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, wg, v); return -ENODEV; } - blk_queue_max_zone_append_sectors(q, v); + lim->max_hw_zone_append_sectors = v; dev_dbg(&vdev->dev, "max append sectors = %u\n", v); - return blk_revalidate_disk_zones(vblk->disk, NULL); + return 0; } - #else - /* - * Zoned block device support is not configured in this kernel. - * Host-managed zoned devices can't be supported, but others are - * good to go as regular block devices. + * Zoned block device support is not configured in this kernel, host-managed + * zoned devices can't be supported. */ #define virtblk_report_zones NULL - -static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, - struct virtio_blk *vblk, struct request_queue *q) +static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { - dev_err(&vdev->dev, + dev_err(&vblk->vdev->dev, "virtio_blk: zoned devices are not supported"); return -EOPNOTSUPP; } @@ -969,8 +958,7 @@ static int init_vq(struct virtio_blk *vblk) { int err; unsigned short i; - vq_callback_t **callbacks; - const char **names; + struct virtqueue_info *vqs_info; struct virtqueue **vqs; unsigned short num_vqs; unsigned short num_poll_vqs; @@ -1007,28 +995,26 @@ static int init_vq(struct virtio_blk *vblk) if (!vblk->vqs) return -ENOMEM; - names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL); - callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL); + vqs_info = kcalloc(num_vqs, sizeof(*vqs_info), GFP_KERNEL); vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL); - if (!names || !callbacks || !vqs) { + if (!vqs_info || !vqs) { err = -ENOMEM; goto out; } for (i = 0; i < num_vqs - num_poll_vqs; i++) { - callbacks[i] = virtblk_done; + vqs_info[i].callback = virtblk_done; snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i); - names[i] = vblk->vqs[i].name; + vqs_info[i].name = vblk->vqs[i].name; } for (; i < num_vqs; i++) { - callbacks[i] = NULL; snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i); - names[i] = vblk->vqs[i].name; + vqs_info[i].name = vblk->vqs[i].name; } /* Discover virtqueues and write information to configuration. 
*/ - err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, vqs_info, &desc); if (err) goto out; @@ -1040,8 +1026,7 @@ static int init_vq(struct virtio_blk *vblk) out: kfree(vqs); - kfree(callbacks); - kfree(names); + kfree(vqs_info); if (err) kfree(vblk->vqs); return err; @@ -1094,14 +1079,6 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) return writeback; } -static void virtblk_update_cache_mode(struct virtio_device *vdev) -{ - u8 writeback = virtblk_get_cache_mode(vdev); - struct virtio_blk *vblk = vdev->priv; - - blk_queue_write_cache(vblk->disk->queue, writeback, false); -} - static const char *const virtblk_cache_types[] = { "write through", "write back" }; @@ -1113,6 +1090,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; struct virtio_device *vdev = vblk->vdev; + struct queue_limits lim; int i; BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); @@ -1121,7 +1099,15 @@ cache_type_store(struct device *dev, struct device_attribute *attr, return i; virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); - virtblk_update_cache_mode(vdev); + + lim = queue_limits_start_update(disk->queue); + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + else + lim.features &= ~BLK_FEAT_WRITE_CACHE; + i = queue_limits_commit_update_frozen(disk->queue, &lim); + if (i) + return i; return count; } @@ -1192,7 +1178,8 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) if (i == HCTX_TYPE_POLL) blk_mq_map_queues(&set->map[i]); else - blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); + blk_mq_map_hw_queues(&set->map[i], + &vblk->vdev->dev, 0); } } @@ -1220,11 +1207,12 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); + u8 status = virtblk_vbr_status(vbr); found++; if (!blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) + !blk_mq_add_to_batch(req, iob, status != VIRTIO_BLK_S_OK, + virtblk_complete_batch)) virtblk_request_done(req); } @@ -1248,31 +1236,17 @@ static const struct blk_mq_ops virtio_mq_ops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); -static int virtblk_probe(struct virtio_device *vdev) +static int virtblk_read_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { - struct virtio_blk *vblk; - struct request_queue *q; - int err, index; - - u32 v, blk_size, max_size, sg_elems, opt_io_size; + struct virtio_device *vdev = vblk->vdev; + u32 v, max_size, sg_elems, opt_io_size; u32 max_discard_segs = 0; u32 discard_granularity = 0; u16 min_io_size; u8 physical_block_exp, alignment_offset; - unsigned int queue_depth; size_t max_dma_size; - - if (!vdev->config->get) { - dev_err(&vdev->dev, "%s failure: config access disabled\n", - __func__); - return -EINVAL; - } - - err = ida_alloc_range(&vd_index_ida, 0, - minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); - if (err < 0) - goto out; - index = err; + int err; /* We need to know how many segments before we allocate. 
*/ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, @@ -1286,78 +1260,11 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); - if (!vblk) { - err = -ENOMEM; - goto out_free_index; - } - - mutex_init(&vblk->vdev_mutex); - - vblk->vdev = vdev; - - INIT_WORK(&vblk->config_work, virtblk_config_changed_work); - - err = init_vq(vblk); - if (err) - goto out_free_vblk; - - /* Default queue sizing is to fill the ring. */ - if (!virtblk_queue_depth) { - queue_depth = vblk->vqs[0].vq->num_free; - /* ... but without indirect descs, we use 2 descs per req */ - if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - queue_depth /= 2; - } else { - queue_depth = virtblk_queue_depth; - } - - memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); - vblk->tag_set.ops = &virtio_mq_ops; - vblk->tag_set.queue_depth = queue_depth; - vblk->tag_set.numa_node = NUMA_NO_NODE; - vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - vblk->tag_set.cmd_size = - sizeof(struct virtblk_req) + - sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; - vblk->tag_set.driver_data = vblk; - vblk->tag_set.nr_hw_queues = vblk->num_vqs; - vblk->tag_set.nr_maps = 1; - if (vblk->io_queues[HCTX_TYPE_POLL]) - vblk->tag_set.nr_maps = 3; - - err = blk_mq_alloc_tag_set(&vblk->tag_set); - if (err) - goto out_free_vq; - - vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); - if (IS_ERR(vblk->disk)) { - err = PTR_ERR(vblk->disk); - goto out_free_tags; - } - q = vblk->disk->queue; - - virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); - - vblk->disk->major = major; - vblk->disk->first_minor = index_to_minor(index); - vblk->disk->minors = 1 << PART_BITS; - vblk->disk->private_data = vblk; - vblk->disk->fops = &virtblk_fops; - vblk->index = index; - - /* configure queue flush support */ - virtblk_update_cache_mode(vdev); - - /* If disk is read-only in the host, the guest should obey */ - if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) - set_disk_ro(vblk->disk, 1); - /* We can handle whatever the host told us to handle. */ - blk_queue_max_segments(q, sg_elems); + lim->max_segments = sg_elems; /* No real sector limit. */ - blk_queue_max_hw_sectors(q, UINT_MAX); + lim->max_hw_sectors = UINT_MAX; max_dma_size = virtio_max_dma_size(vdev); max_size = max_dma_size > U32_MAX ? 
U32_MAX : max_dma_size; @@ -1369,50 +1276,39 @@ static int virtblk_probe(struct virtio_device *vdev) if (!err) max_size = min(max_size, v); - blk_queue_max_segment_size(q, max_size); + lim->max_segment_size = max_size; /* Host can optionally specify the block size of the device */ - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, + virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, struct virtio_blk_config, blk_size, - &blk_size); - if (!err) { - err = blk_validate_block_size(blk_size); - if (err) { - dev_err(&vdev->dev, - "virtio_blk: invalid block size: 0x%x\n", - blk_size); - goto out_cleanup_disk; - } - - blk_queue_logical_block_size(q, blk_size); - } else - blk_size = queue_logical_block_size(q); + &lim->logical_block_size); /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, &physical_block_exp); if (!err && physical_block_exp) - blk_queue_physical_block_size(q, - blk_size * (1 << physical_block_exp)); + lim->physical_block_size = + lim->logical_block_size * (1 << physical_block_exp); err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, alignment_offset, &alignment_offset); if (!err && alignment_offset) - blk_queue_alignment_offset(q, blk_size * alignment_offset); + lim->alignment_offset = + lim->logical_block_size * alignment_offset; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, min_io_size, &min_io_size); if (!err && min_io_size) - blk_queue_io_min(q, blk_size * min_io_size); + lim->io_min = lim->logical_block_size * min_io_size; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, opt_io_size, &opt_io_size); if (!err && opt_io_size) - blk_queue_io_opt(q, blk_size * opt_io_size); + lim->io_opt = lim->logical_block_size * opt_io_size; if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { virtio_cread(vdev, struct virtio_blk_config, @@ -1420,7 +1316,7 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); - blk_queue_max_discard_sectors(q, v ? v : UINT_MAX); + lim->max_hw_discard_sectors = v ? v : UINT_MAX; virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &max_discard_segs); @@ -1429,7 +1325,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { virtio_cread(vdev, struct virtio_blk_config, max_write_zeroes_sectors, &v); - blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX); + lim->max_write_zeroes_sectors = v ? 
v : UINT_MAX; } /* The discard and secure erase limits are combined since the Linux @@ -1455,8 +1351,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: secure_erase_sector_alignment can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } discard_granularity = min_not_zero(discard_granularity, v); @@ -1470,11 +1365,10 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: max_secure_erase_sectors can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } - blk_queue_max_secure_erase_sectors(q, v); + lim->max_secure_erase_sectors = v; virtio_cread(vdev, struct virtio_blk_config, max_secure_erase_seg, &v); @@ -1485,8 +1379,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: max_secure_erase_seg can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } max_discard_segs = min_not_zero(max_discard_segs, v); @@ -1502,45 +1395,144 @@ static int virtblk_probe(struct virtio_device *vdev) if (!max_discard_segs) max_discard_segs = sg_elems; - blk_queue_max_discard_segments(q, - min(max_discard_segs, MAX_DISCARD_SEGMENTS)); + lim->max_discard_segments = + min(max_discard_segs, MAX_DISCARD_SEGMENTS); if (discard_granularity) - q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT; + lim->discard_granularity = + discard_granularity << SECTOR_SHIFT; else - q->limits.discard_granularity = blk_size; + lim->discard_granularity = lim->logical_block_size; } - virtblk_update_capacity(vblk, false); - virtio_device_ready(vdev); - - /* - * All steps that follow use the VQs therefore they need to be - * placed after the virtio_device_ready() call above. - */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { u8 model; - virtio_cread(vdev, struct virtio_blk_config, zoned.model, - &model); + virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); switch (model) { case VIRTIO_BLK_Z_NONE: case VIRTIO_BLK_Z_HA: - /* Present the host-aware device as non-zoned */ - break; + /* treat host-aware devices as non-zoned */ + return 0; case VIRTIO_BLK_Z_HM: - err = virtblk_probe_zoned_device(vdev, vblk, q); + err = virtblk_read_zoned_limits(vblk, lim); if (err) - goto out_cleanup_disk; + return err; break; default: - dev_err(&vdev->dev, "unsupported zone model %d\n", - model); - err = -EINVAL; - goto out_cleanup_disk; + dev_err(&vdev->dev, "unsupported zone model %d\n", model); + return -EINVAL; } } + return 0; +} + +static int virtblk_probe(struct virtio_device *vdev) +{ + struct virtio_blk *vblk; + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + .logical_block_size = SECTOR_SIZE, + }; + int err, index; + unsigned int queue_depth; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + err = ida_alloc_range(&vd_index_ida, 0, + minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); + if (err < 0) + goto out; + index = err; + + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); + if (!vblk) { + err = -ENOMEM; + goto out_free_index; + } + + mutex_init(&vblk->vdev_mutex); + + vblk->vdev = vdev; + + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); + + err = init_vq(vblk); + if (err) + goto out_free_vblk; + + /* Default queue sizing is to fill the ring. */ + if (!virtblk_queue_depth) { + queue_depth = vblk->vqs[0].vq->num_free; + /* ... 
but without indirect descs, we use 2 descs per req */ + if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) + queue_depth /= 2; + } else { + queue_depth = virtblk_queue_depth; + } + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.queue_depth = queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.cmd_size = + sizeof(struct virtblk_req) + + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; + vblk->tag_set.driver_data = vblk; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; + vblk->tag_set.nr_maps = 1; + if (vblk->io_queues[HCTX_TYPE_POLL]) + vblk->tag_set.nr_maps = 3; + + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_free_vq; + + err = virtblk_read_limits(vblk, &lim); + if (err) + goto out_free_tags; + + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk); + if (IS_ERR(vblk->disk)) { + err = PTR_ERR(vblk->disk); + goto out_free_tags; + } + + virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); + + vblk->disk->major = major; + vblk->disk->first_minor = index_to_minor(index); + vblk->disk->minors = 1 << PART_BITS; + vblk->disk->private_data = vblk; + vblk->disk->fops = &virtblk_fops; + vblk->index = index; + + /* If disk is read-only in the host, the guest should obey */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) + set_disk_ro(vblk->disk, 1); + + virtblk_update_capacity(vblk, false); + virtio_device_ready(vdev); + + /* + * All steps that follow use the VQs therefore they need to be + * placed after the virtio_device_ready() call above. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (lim.features & BLK_FEAT_ZONED)) { + err = blk_revalidate_disk_zones(vblk->disk); + if (err) + goto out_cleanup_disk; + } + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); if (err) goto out_cleanup_disk; @@ -1588,13 +1580,16 @@ static void virtblk_remove(struct virtio_device *vdev) put_disk(vblk->disk); } -#ifdef CONFIG_PM_SLEEP -static int virtblk_freeze(struct virtio_device *vdev) +static int virtblk_freeze_priv(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; + struct request_queue *q = vblk->disk->queue; + unsigned int memflags; /* Ensure no requests in virtqueues before deleting vqs. 
*/ - blk_mq_freeze_queue(vblk->disk->queue); + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue_nowait(q); + blk_mq_unfreeze_queue(q, memflags); /* Ensure we don't receive any more interrupts */ virtio_reset_device(vdev); @@ -1608,7 +1603,7 @@ static int virtblk_freeze(struct virtio_device *vdev) return 0; } -static int virtblk_restore(struct virtio_device *vdev) +static int virtblk_restore_priv(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; int ret; @@ -1618,12 +1613,33 @@ static int virtblk_restore(struct virtio_device *vdev) return ret; virtio_device_ready(vdev); + blk_mq_unquiesce_queue(vblk->disk->queue); - blk_mq_unfreeze_queue(vblk->disk->queue); return 0; } + +#ifdef CONFIG_PM_SLEEP +static int virtblk_freeze(struct virtio_device *vdev) +{ + return virtblk_freeze_priv(vdev); +} + +static int virtblk_restore(struct virtio_device *vdev) +{ + return virtblk_restore_priv(vdev); +} #endif +static int virtblk_reset_prepare(struct virtio_device *vdev) +{ + return virtblk_freeze_priv(vdev); +} + +static int virtblk_reset_done(struct virtio_device *vdev) +{ + return virtblk_restore_priv(vdev); +} + static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, { 0 }, @@ -1651,7 +1667,6 @@ static struct virtio_driver virtio_blk = { .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtblk_probe, .remove = virtblk_remove, @@ -1660,6 +1675,8 @@ static struct virtio_driver virtio_blk = { .freeze = virtblk_freeze, .restore = virtblk_restore, #endif + .reset_prepare = virtblk_reset_prepare, + .reset_done = virtblk_reset_done, }; static int __init virtio_blk_init(void) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4defd7f387c7..a7c2b04ab943 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -465,7 +465,7 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, } req->dev = vbd->pdevice; - req->bdev = vbd->bdev_handle->bdev; + req->bdev = file_bdev(vbd->bdev_file); rc = 0; out: @@ -544,7 +544,7 @@ static void print_stats(struct xen_blkif_ring *ring) ring->st_rd_req, ring->st_wr_req, ring->st_f_req, ring->st_ds_req, ring->persistent_gnt_c, max_pgrants); - ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_print = jiffies + secs_to_jiffies(10); ring->st_rd_req = 0; ring->st_wr_req = 0; ring->st_oo_req = 0; @@ -969,7 +969,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, int err = 0; int status = BLKIF_RSP_OKAY; struct xen_blkif *blkif = ring->blkif; - struct block_device *bdev = blkif->vbd.bdev_handle->bdev; + struct block_device *bdev = file_bdev(blkif->vbd.bdev_file); struct phys_req preq; xen_blkif_get(blkif); @@ -1563,5 +1563,6 @@ static void __exit xen_blkif_fini(void) module_exit(xen_blkif_fini); +MODULE_DESCRIPTION("Virtual block device back-end driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("xen-backend:vbd"); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 1432c83183d0..b427d54bc120 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -221,7 +221,7 @@ struct xen_vbd { unsigned char type; /* phys device that this vbd maps to. */ u32 pdevice; - struct bdev_handle *bdev_handle; + struct file *bdev_file; /* Cached size parameter. 
*/ sector_t size; unsigned int flush_support:1; @@ -360,7 +360,7 @@ struct pending_req { }; -#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev_handle->bdev) +#define vbd_sz(_v) bdev_nr_sectors(file_bdev((_v)->bdev_file)) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index e34219ea2b05..0621878940ae 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -81,7 +81,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) int i; /* Not ready to connect? */ - if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev_handle) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev_file) return; /* Already connected? */ @@ -99,13 +99,12 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) return; } - err = sync_blockdev(blkif->vbd.bdev_handle->bdev); + err = sync_blockdev(file_bdev(blkif->vbd.bdev_file)); if (err) { xenbus_dev_error(blkif->be->dev, err, "block flush"); return; } - invalidate_inode_pages2( - blkif->vbd.bdev_handle->bdev->bd_inode->i_mapping); + invalidate_inode_pages2(blkif->vbd.bdev_file->f_mapping); for (i = 0; i < blkif->nr_rings; i++) { ring = &blkif->rings[i]; @@ -473,9 +472,9 @@ static void xenvbd_sysfs_delif(struct xenbus_device *dev) static void xen_vbd_free(struct xen_vbd *vbd) { - if (vbd->bdev_handle) - bdev_release(vbd->bdev_handle); - vbd->bdev_handle = NULL; + if (vbd->bdev_file) + fput(vbd->bdev_file); + vbd->bdev_file = NULL; } static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, @@ -483,7 +482,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, int cdrom) { struct xen_vbd *vbd; - struct bdev_handle *bdev_handle; + struct file *bdev_file; vbd = &blkif->vbd; vbd->handle = handle; @@ -492,17 +491,17 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->pdevice = MKDEV(major, minor); - bdev_handle = bdev_open_by_dev(vbd->pdevice, vbd->readonly ? + bdev_file = bdev_file_open_by_dev(vbd->pdevice, vbd->readonly ? 
BLK_OPEN_READ : BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { pr_warn("xen_vbd_create: device %08x could not be opened\n", vbd->pdevice); return -ENOENT; } - vbd->bdev_handle = bdev_handle; - if (vbd->bdev_handle->bdev->bd_disk == NULL) { + vbd->bdev_file = bdev_file; + if (file_bdev(vbd->bdev_file)->bd_disk == NULL) { pr_warn("xen_vbd_create: device %08x doesn't exist\n", vbd->pdevice); xen_vbd_free(vbd); @@ -510,14 +509,14 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, } vbd->size = vbd_sz(vbd); - if (cdrom || disk_to_cdi(vbd->bdev_handle->bdev->bd_disk)) + if (cdrom || disk_to_cdi(file_bdev(vbd->bdev_file)->bd_disk)) vbd->type |= VDISK_CDROM; - if (vbd->bdev_handle->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + if (file_bdev(vbd->bdev_file)->bd_disk->flags & GENHD_FL_REMOVABLE) vbd->type |= VDISK_REMOVABLE; - if (bdev_write_cache(bdev_handle->bdev)) + if (bdev_write_cache(file_bdev(bdev_file))) vbd->flush_support = true; - if (bdev_max_secure_erase_sectors(bdev_handle->bdev)) + if (bdev_max_secure_erase_sectors(file_bdev(bdev_file))) vbd->discard_secure = true; pr_debug("Successful creation of handle=%04x (dom=%u)\n", @@ -570,7 +569,7 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info struct xen_blkif *blkif = be->blkif; int err; int state = 0; - struct block_device *bdev = be->blkif->vbd.bdev_handle->bdev; + struct block_device *bdev = file_bdev(be->blkif->vbd.bdev_file); if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1)) return; @@ -932,7 +931,7 @@ again: } err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", (unsigned long)bdev_logical_block_size( - be->blkif->vbd.bdev_handle->bdev)); + file_bdev(be->blkif->vbd.bdev_file))); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sector-size", dev->nodename); @@ -940,7 +939,7 @@ again: } err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", bdev_physical_block_size( - be->blkif->vbd.bdev_handle->bdev)); + file_bdev(be->blkif->vbd.bdev_file))); if (err) xenbus_dev_error(dev, err, "writing %s/physical-sector-size", dev->nodename); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 434fab306777..edcd08a9dcef 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -788,6 +788,11 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * A barrier request a superset of FUA, so we can * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) + * + * Note that can end up here with a FUA write and the + * flags cleared. This happens when the flag was + * run-time disabled after a failing I/O, and we'll + * simplify submit it as a normal write. 
*/ if (info->feature_flush && info->feature_fua) ring_req->operation = @@ -795,8 +800,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri else if (info->feature_flush) ring_req->operation = BLKIF_OP_FLUSH_DISKCACHE; - else - ring_req->operation = 0; } ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { @@ -887,16 +890,6 @@ static inline void flush_requests(struct blkfront_ring_info *rinfo) notify_remote_via_irq(rinfo->irq); } -static inline bool blkif_request_flush_invalid(struct request *req, - struct blkfront_info *info) -{ - return (blk_rq_is_passthrough(req) || - ((req_op(req) == REQ_OP_FLUSH) && - !info->feature_flush) || - ((req->cmd_flags & REQ_FUA) && - !info->feature_fua)); -} - static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { @@ -908,12 +901,22 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, rinfo = get_rinfo(info, qid); blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); - if (RING_FULL(&rinfo->ring)) - goto out_busy; - if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) - goto out_err; + /* + * Check if the backend actually supports flushes. + * + * While the block layer won't send us flushes if we don't claim to + * support them, the Xen protocol allows the backend to revoke support + * at any time. That is of course a really bad idea and dangerous, but + * has been allowed for 10+ years. In that case we simply clear the + * flags, and directly return here for an empty flush and ignore the + * FUA flag later on. + */ + if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush)) + goto complete; + if (RING_FULL(&rinfo->ring)) + goto out_busy; if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; @@ -921,14 +924,14 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_OK; -out_err: - spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_STS_IOERR; - out_busy: blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_DEV_RESOURCE; +complete: + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + blk_mq_end_request(qd->rq, BLK_STS_OK); + return BLK_STS_OK; } static void blkif_complete_rq(struct request *rq) @@ -941,39 +944,41 @@ static const struct blk_mq_ops blkfront_mq_ops = { .complete = blkif_complete_rq, }; -static void blkif_set_queue_limits(struct blkfront_info *info) +static void blkif_set_queue_limits(const struct blkfront_info *info, + struct queue_limits *lim) { - struct request_queue *rq = info->rq; - struct gendisk *gd = info->gd; unsigned int segments = info->max_indirect_segments ? 
: BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); - if (info->feature_discard) { - blk_queue_max_discard_sectors(rq, get_capacity(gd)); - rq->limits.discard_granularity = info->discard_granularity ?: - info->physical_sector_size; - rq->limits.discard_alignment = info->discard_alignment; + lim->max_hw_discard_sectors = UINT_MAX; + if (info->discard_granularity) + lim->discard_granularity = info->discard_granularity; + lim->discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - blk_queue_max_secure_erase_sectors(rq, - get_capacity(gd)); + lim->max_secure_erase_sectors = UINT_MAX; + } + + if (info->feature_flush) { + lim->features |= BLK_FEAT_WRITE_CACHE; + if (info->feature_fua) + lim->features |= BLK_FEAT_FUA; } /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_logical_block_size(rq, info->sector_size); - blk_queue_physical_block_size(rq, info->physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); + lim->logical_block_size = info->sector_size; + lim->physical_block_size = info->physical_sector_size; + lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512; /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(rq, PAGE_SIZE - 1); - blk_queue_max_segment_size(rq, PAGE_SIZE); + lim->seg_boundary_mask = PAGE_SIZE - 1; + lim->max_segment_size = PAGE_SIZE; /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); + lim->max_segments = segments / GRANTS_PER_PSEG; /* Make sure buffer addresses are sector-aligned. */ - blk_queue_dma_alignment(rq, 511); + lim->dma_alignment = 511; } static const char *flush_info(struct blkfront_info *info) @@ -988,8 +993,6 @@ static const char *flush_info(struct blkfront_info *info) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_write_cache(info->rq, info->feature_flush ? true : false, - info->feature_fua ? true : false); pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? 
@@ -1067,9 +1070,9 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info) { + struct queue_limits lim = {}; struct gendisk *gd; int nr_minors = 1; int err; @@ -1128,7 +1131,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, } else info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; - info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; info->tag_set.cmd_size = sizeof(struct blkif_req); info->tag_set.driver_data = info; @@ -1136,7 +1138,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (err) goto out_release_minors; - gd = blk_mq_alloc_disk(&info->tag_set, info); + blkif_set_queue_limits(info, &lim); + gd = blk_mq_alloc_disk(&info->tag_set, &lim, info); if (IS_ERR(gd)) { err = PTR_ERR(gd); goto out_free_tag_set; @@ -1160,9 +1163,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->rq = gd->queue; info->gd = gd; - info->sector_size = sector_size; - info->physical_sector_size = physical_sector_size; - blkif_set_queue_limits(info); xlvbd_flush(info); @@ -1607,8 +1607,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_max_discard_sectors(rq, 0); - blk_queue_max_secure_erase_sectors(rq, 0); + blk_queue_disable_discard(rq); + blk_queue_disable_secure_erase(rq); } break; case BLKIF_OP_FLUSH_DISKCACHE: @@ -1629,7 +1629,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; - xlvbd_flush(info); } fallthrough; case BLKIF_OP_READ: @@ -2006,18 +2005,19 @@ static int blkfront_probe(struct xenbus_device *dev, static int blkif_recover(struct blkfront_info *info) { + struct queue_limits lim; unsigned int r_index; struct request *req, *n; int rc; struct bio *bio; - unsigned int segs; struct blkfront_ring_info *rinfo; + lim = queue_limits_start_update(info->rq); blkfront_gather_backend_features(info); - /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ - blkif_set_queue_limits(info); - segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); + blkif_set_queue_limits(info, &lim); + rc = queue_limits_commit_update(info->rq, &lim); + if (rc) + return rc; for_each_rinfo(info, rinfo, r_index) { rc = blkfront_setup_indirect(rinfo); @@ -2037,7 +2037,9 @@ static int blkif_recover(struct blkfront_info *info) list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); - BUG_ON(req->nr_phys_segments > segs); + BUG_ON(req->nr_phys_segments > + (info->max_indirect_segments ? 
: + BLKIF_MAX_SEGMENTS_PER_REQUEST)); blk_mq_requeue_request(req, false); } blk_mq_start_stopped_hw_queues(info->rq, true); @@ -2314,8 +2316,6 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; - unsigned long sector_size; - unsigned int physical_sector_size; int err, i; struct blkfront_ring_info *rinfo; @@ -2354,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", §ors, "info", "%u", &info->vdisk_info, - "sector-size", "%lu", §or_size, + "sector-size", "%lu", &info->sector_size, NULL); if (err) { xenbus_dev_fatal(info->xbdev, err, @@ -2368,9 +2368,9 @@ static void blkfront_connect(struct blkfront_info *info) * provide this. Assume physical sector size to be the same as * sector_size in that case. */ - physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, + info->physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, "physical-sector-size", - sector_size); + info->sector_size); blkfront_gather_backend_features(info); for_each_rinfo(info, rinfo, i) { err = blkfront_setup_indirect(rinfo); @@ -2382,8 +2382,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, sector_size, - physical_sector_size); + err = xlvbd_alloc_gendisk(sectors, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 11493167b0a8..8c1c7f4211eb 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&tag_set, NULL); + disk = blk_mq_alloc_disk(&tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); @@ -354,7 +354,6 @@ static int __init z2_init(void) tag_set.nr_maps = 1; tag_set.queue_depth = 16; tag_set.numa_node = NUMA_NO_NODE; - tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ret = blk_mq_alloc_tag_set(&tag_set); if (ret) goto out_unregister_blkdev; @@ -409,4 +408,5 @@ static void __exit z2_exit(void) module_init(z2_init); module_exit(z2_exit); +MODULE_DESCRIPTION("Amiga Zorro II ramdisk driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 7b29cce60ab2..402b7b175863 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -2,7 +2,6 @@ config ZRAM tristate "Compressed RAM block device support" depends on BLOCK && SYSFS && MMU - depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842 select ZSMALLOC help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -15,6 +14,49 @@ config ZRAM See Documentation/admin-guide/blockdev/zram.rst for more information. 
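The ZRAM_BACKEND_* options added below replace the old crypto-API dependency removed above: each compressor becomes a native backend that implements the zcomp_ops interface introduced later in this diff (see the zcomp.h hunk). As an orientation sketch only — this is not part of the patch, and the "copy" backend and its identifiers are purely hypothetical — a minimal backend under that interface would look roughly like this:

/*
 * Hypothetical, illustrative-only zram backend sketch built against the
 * zcomp_ops/zcomp_req types added by this series; it is not part of the
 * patch and performs no real compression (it just copies the page).
 */
#include <linux/kernel.h>
#include <linux/string.h>

#include "zcomp.h"

static int copy_setup_params(struct zcomp_params *params)
{
	return 0;		/* no level or dictionary handling */
}

static void copy_release_params(struct zcomp_params *params)
{
}

static int copy_create(struct zcomp_params *params, struct zcomp_ctx *ctx)
{
	return 0;		/* no per-CPU scratch state needed */
}

static void copy_destroy(struct zcomp_ctx *ctx)
{
}

static int copy_compress(struct zcomp_params *params, struct zcomp_ctx *ctx,
			 struct zcomp_req *req)
{
	/* "Compressed" output is the input itself, so it must fit. */
	if (req->dst_len < req->src_len)
		return -EINVAL;
	memcpy(req->dst, req->src, req->src_len);
	req->dst_len = req->src_len;
	return 0;
}

static int copy_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx,
			   struct zcomp_req *req)
{
	memcpy(req->dst, req->src, req->src_len);
	return 0;
}

const struct zcomp_ops backend_copy = {
	.compress	= copy_compress,
	.decompress	= copy_decompress,
	.create_ctx	= copy_create,
	.destroy_ctx	= copy_destroy,
	.setup_params	= copy_setup_params,
	.release_params	= copy_release_params,
	.name		= "copy",
};

A real backend is additionally listed in the backends[] table in zcomp.c and wired into the zram Makefile behind its ZRAM_BACKEND_* option, as the hunks below do for lzo/lzo-rle, lz4, lz4hc, zstd, deflate and 842.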
+config ZRAM_BACKEND_LZ4 + bool "lz4 compression support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + +config ZRAM_BACKEND_LZ4HC + bool "lz4hc compression support" + depends on ZRAM + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + +config ZRAM_BACKEND_ZSTD + bool "zstd compression support" + depends on ZRAM + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + +config ZRAM_BACKEND_DEFLATE + bool "deflate compression support" + depends on ZRAM + select ZLIB_DEFLATE + select ZLIB_INFLATE + +config ZRAM_BACKEND_842 + bool "842 compression support" + depends on ZRAM + select 842_COMPRESS + select 842_DECOMPRESS + +config ZRAM_BACKEND_FORCE_LZO + depends on ZRAM + def_bool !ZRAM_BACKEND_LZ4 && !ZRAM_BACKEND_LZ4HC && \ + !ZRAM_BACKEND_ZSTD && !ZRAM_BACKEND_DEFLATE && \ + !ZRAM_BACKEND_842 + +config ZRAM_BACKEND_LZO + bool "lzo and lzo-rle compression support" if !ZRAM_BACKEND_FORCE_LZO + depends on ZRAM + default ZRAM_BACKEND_FORCE_LZO + select LZO_COMPRESS + select LZO_DECOMPRESS + choice prompt "Default zram compressor" default ZRAM_DEF_COMP_LZORLE @@ -22,38 +64,45 @@ choice config ZRAM_DEF_COMP_LZORLE bool "lzo-rle" - depends on CRYPTO_LZO + depends on ZRAM_BACKEND_LZO -config ZRAM_DEF_COMP_ZSTD - bool "zstd" - depends on CRYPTO_ZSTD +config ZRAM_DEF_COMP_LZO + bool "lzo" + depends on ZRAM_BACKEND_LZO config ZRAM_DEF_COMP_LZ4 bool "lz4" - depends on CRYPTO_LZ4 - -config ZRAM_DEF_COMP_LZO - bool "lzo" - depends on CRYPTO_LZO + depends on ZRAM_BACKEND_LZ4 config ZRAM_DEF_COMP_LZ4HC bool "lz4hc" - depends on CRYPTO_LZ4HC + depends on ZRAM_BACKEND_LZ4HC + +config ZRAM_DEF_COMP_ZSTD + bool "zstd" + depends on ZRAM_BACKEND_ZSTD + +config ZRAM_DEF_COMP_DEFLATE + bool "deflate" + depends on ZRAM_BACKEND_DEFLATE config ZRAM_DEF_COMP_842 bool "842" - depends on CRYPTO_842 + depends on ZRAM_BACKEND_842 endchoice config ZRAM_DEF_COMP string + depends on ZRAM default "lzo-rle" if ZRAM_DEF_COMP_LZORLE - default "zstd" if ZRAM_DEF_COMP_ZSTD - default "lz4" if ZRAM_DEF_COMP_LZ4 default "lzo" if ZRAM_DEF_COMP_LZO + default "lz4" if ZRAM_DEF_COMP_LZ4 default "lz4hc" if ZRAM_DEF_COMP_LZ4HC + default "zstd" if ZRAM_DEF_COMP_ZSTD + default "deflate" if ZRAM_DEF_COMP_DEFLATE default "842" if ZRAM_DEF_COMP_842 + default "unset-value" config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1..0fdefd576691 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only + zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_BACKEND_LZO) += backend_lzorle.o backend_lzo.o +zram-$(CONFIG_ZRAM_BACKEND_LZ4) += backend_lz4.o +zram-$(CONFIG_ZRAM_BACKEND_LZ4HC) += backend_lz4hc.o +zram-$(CONFIG_ZRAM_BACKEND_ZSTD) += backend_zstd.o +zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o +zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/backend_842.c b/drivers/block/zram/backend_842.c new file mode 100644 index 000000000000..10d9d5c60f53 --- /dev/null +++ b/drivers/block/zram/backend_842.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/sw842.h> +#include <linux/vmalloc.h> + +#include "backend_842.h" + +static void release_params_842(struct zcomp_params *params) +{ +} + +static int setup_params_842(struct zcomp_params *params) +{ + return 0; +} + +static void destroy_842(struct 
zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int create_842(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kmalloc(SW842_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static int compress_842(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + unsigned int dlen = req->dst_len; + int ret; + + ret = sw842_compress(req->src, req->src_len, req->dst, &dlen, + ctx->context); + if (ret == 0) + req->dst_len = dlen; + return ret; +} + +static int decompress_842(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + unsigned int dlen = req->dst_len; + + return sw842_decompress(req->src, req->src_len, req->dst, &dlen); +} + +const struct zcomp_ops backend_842 = { + .compress = compress_842, + .decompress = decompress_842, + .create_ctx = create_842, + .destroy_ctx = destroy_842, + .setup_params = setup_params_842, + .release_params = release_params_842, + .name = "842", +}; diff --git a/drivers/block/zram/backend_842.h b/drivers/block/zram/backend_842.h new file mode 100644 index 000000000000..4dc85c188799 --- /dev/null +++ b/drivers/block/zram/backend_842.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_842_H__ +#define __BACKEND_842_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_842; + +#endif /* __BACKEND_842_H__ */ diff --git a/drivers/block/zram/backend_deflate.c b/drivers/block/zram/backend_deflate.c new file mode 100644 index 000000000000..0f7f252c12f4 --- /dev/null +++ b/drivers/block/zram/backend_deflate.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#include "backend_deflate.h" + +/* Use the same value as crypto API */ +#define DEFLATE_DEF_WINBITS 11 +#define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL + +struct deflate_ctx { + struct z_stream_s cctx; + struct z_stream_s dctx; +}; + +static void deflate_release_params(struct zcomp_params *params) +{ +} + +static int deflate_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = Z_DEFAULT_COMPRESSION; + + return 0; +} + +static void deflate_destroy(struct zcomp_ctx *ctx) +{ + struct deflate_ctx *zctx = ctx->context; + + if (!zctx) + return; + + if (zctx->cctx.workspace) { + zlib_deflateEnd(&zctx->cctx); + vfree(zctx->cctx.workspace); + } + if (zctx->dctx.workspace) { + zlib_inflateEnd(&zctx->dctx); + vfree(zctx->dctx.workspace); + } + kfree(zctx); +} + +static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct deflate_ctx *zctx; + size_t sz; + int ret; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + sz = zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL); + zctx->cctx.workspace = vzalloc(sz); + if (!zctx->cctx.workspace) + goto error; + + ret = zlib_deflateInit2(&zctx->cctx, params->level, Z_DEFLATED, + -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, + Z_DEFAULT_STRATEGY); + if (ret != Z_OK) + goto error; + + sz = zlib_inflate_workspacesize(); + zctx->dctx.workspace = vzalloc(sz); + if (!zctx->dctx.workspace) + goto error; + + ret = zlib_inflateInit2(&zctx->dctx, -DEFLATE_DEF_WINBITS); + if (ret != Z_OK) + goto error; + + return 0; + +error: + deflate_destroy(ctx); + return -EINVAL; +} + +static int deflate_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) 
+{ + struct deflate_ctx *zctx = ctx->context; + struct z_stream_s *deflate; + int ret; + + deflate = &zctx->cctx; + ret = zlib_deflateReset(deflate); + if (ret != Z_OK) + return -EINVAL; + + deflate->next_in = (u8 *)req->src; + deflate->avail_in = req->src_len; + deflate->next_out = (u8 *)req->dst; + deflate->avail_out = req->dst_len; + + ret = zlib_deflate(deflate, Z_FINISH); + if (ret != Z_STREAM_END) + return -EINVAL; + + req->dst_len = deflate->total_out; + return 0; +} + +static int deflate_decompress(struct zcomp_params *params, + struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct deflate_ctx *zctx = ctx->context; + struct z_stream_s *inflate; + int ret; + + inflate = &zctx->dctx; + + ret = zlib_inflateReset(inflate); + if (ret != Z_OK) + return -EINVAL; + + inflate->next_in = (u8 *)req->src; + inflate->avail_in = req->src_len; + inflate->next_out = (u8 *)req->dst; + inflate->avail_out = req->dst_len; + + ret = zlib_inflate(inflate, Z_SYNC_FLUSH); + if (ret != Z_STREAM_END) + return -EINVAL; + + return 0; +} + +const struct zcomp_ops backend_deflate = { + .compress = deflate_compress, + .decompress = deflate_decompress, + .create_ctx = deflate_create, + .destroy_ctx = deflate_destroy, + .setup_params = deflate_setup_params, + .release_params = deflate_release_params, + .name = "deflate", +}; diff --git a/drivers/block/zram/backend_deflate.h b/drivers/block/zram/backend_deflate.h new file mode 100644 index 000000000000..a39ac12b114c --- /dev/null +++ b/drivers/block/zram/backend_deflate.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_DEFLATE_H__ +#define __BACKEND_DEFLATE_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_deflate; + +#endif /* __BACKEND_DEFLATE_H__ */ diff --git a/drivers/block/zram/backend_lz4.c b/drivers/block/zram/backend_lz4.c new file mode 100644 index 000000000000..847f3334eb38 --- /dev/null +++ b/drivers/block/zram/backend_lz4.c @@ -0,0 +1,127 @@ +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "backend_lz4.h" + +struct lz4_ctx { + void *mem; + + LZ4_streamDecode_t *dstrm; + LZ4_stream_t *cstrm; +}; + +static void lz4_release_params(struct zcomp_params *params) +{ +} + +static int lz4_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = LZ4_ACCELERATION_DEFAULT; + + return 0; +} + +static void lz4_destroy(struct zcomp_ctx *ctx) +{ + struct lz4_ctx *zctx = ctx->context; + + if (!zctx) + return; + + vfree(zctx->mem); + kfree(zctx->dstrm); + kfree(zctx->cstrm); + kfree(zctx); +} + +static int lz4_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct lz4_ctx *zctx; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + zctx->mem = vmalloc(LZ4_MEM_COMPRESS); + if (!zctx->mem) + goto error; + } else { + zctx->dstrm = kzalloc(sizeof(*zctx->dstrm), GFP_KERNEL); + if (!zctx->dstrm) + goto error; + + zctx->cstrm = kzalloc(sizeof(*zctx->cstrm), GFP_KERNEL); + if (!zctx->cstrm) + goto error; + } + + return 0; + +error: + lz4_destroy(ctx); + return -ENOMEM; +} + +static int lz4_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4_ctx *zctx = ctx->context; + int ret; + + if (!zctx->cstrm) { + ret = LZ4_compress_fast(req->src, req->dst, req->src_len, + req->dst_len, params->level, + zctx->mem); + } else { + /* Cstrm needs to be reset */ + ret = 
LZ4_loadDict(zctx->cstrm, params->dict, params->dict_sz); + if (ret != params->dict_sz) + return -EINVAL; + ret = LZ4_compress_fast_continue(zctx->cstrm, req->src, + req->dst, req->src_len, + req->dst_len, params->level); + } + if (!ret) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int lz4_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4_ctx *zctx = ctx->context; + int ret; + + if (!zctx->dstrm) { + ret = LZ4_decompress_safe(req->src, req->dst, req->src_len, + req->dst_len); + } else { + /* Dstrm needs to be reset */ + ret = LZ4_setStreamDecode(zctx->dstrm, params->dict, + params->dict_sz); + if (!ret) + return -EINVAL; + ret = LZ4_decompress_safe_continue(zctx->dstrm, req->src, + req->dst, req->src_len, + req->dst_len); + } + if (ret < 0) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_lz4 = { + .compress = lz4_compress, + .decompress = lz4_decompress, + .create_ctx = lz4_create, + .destroy_ctx = lz4_destroy, + .setup_params = lz4_setup_params, + .release_params = lz4_release_params, + .name = "lz4", +}; diff --git a/drivers/block/zram/backend_lz4.h b/drivers/block/zram/backend_lz4.h new file mode 100644 index 000000000000..c11fa602a703 --- /dev/null +++ b/drivers/block/zram/backend_lz4.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZ4_H__ +#define __BACKEND_LZ4_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lz4; + +#endif /* __BACKEND_LZ4_H__ */ diff --git a/drivers/block/zram/backend_lz4hc.c b/drivers/block/zram/backend_lz4hc.c new file mode 100644 index 000000000000..5f37d5abcaeb --- /dev/null +++ b/drivers/block/zram/backend_lz4hc.c @@ -0,0 +1,128 @@ +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "backend_lz4hc.h" + +struct lz4hc_ctx { + void *mem; + + LZ4_streamDecode_t *dstrm; + LZ4_streamHC_t *cstrm; +}; + +static void lz4hc_release_params(struct zcomp_params *params) +{ +} + +static int lz4hc_setup_params(struct zcomp_params *params) +{ + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = LZ4HC_DEFAULT_CLEVEL; + + return 0; +} + +static void lz4hc_destroy(struct zcomp_ctx *ctx) +{ + struct lz4hc_ctx *zctx = ctx->context; + + if (!zctx) + return; + + kfree(zctx->dstrm); + kfree(zctx->cstrm); + vfree(zctx->mem); + kfree(zctx); +} + +static int lz4hc_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct lz4hc_ctx *zctx; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + zctx->mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!zctx->mem) + goto error; + } else { + zctx->dstrm = kzalloc(sizeof(*zctx->dstrm), GFP_KERNEL); + if (!zctx->dstrm) + goto error; + + zctx->cstrm = kzalloc(sizeof(*zctx->cstrm), GFP_KERNEL); + if (!zctx->cstrm) + goto error; + } + + return 0; + +error: + lz4hc_destroy(ctx); + return -EINVAL; +} + +static int lz4hc_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4hc_ctx *zctx = ctx->context; + int ret; + + if (!zctx->cstrm) { + ret = LZ4_compress_HC(req->src, req->dst, req->src_len, + req->dst_len, params->level, + zctx->mem); + } else { + /* Cstrm needs to be reset */ + LZ4_resetStreamHC(zctx->cstrm, params->level); + ret = LZ4_loadDictHC(zctx->cstrm, params->dict, + params->dict_sz); + if (ret != params->dict_sz) + return -EINVAL; + ret = LZ4_compress_HC_continue(zctx->cstrm, req->src, 
req->dst, + req->src_len, req->dst_len); + } + if (!ret) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int lz4hc_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct lz4hc_ctx *zctx = ctx->context; + int ret; + + if (!zctx->dstrm) { + ret = LZ4_decompress_safe(req->src, req->dst, req->src_len, + req->dst_len); + } else { + /* Dstrm needs to be reset */ + ret = LZ4_setStreamDecode(zctx->dstrm, params->dict, + params->dict_sz); + if (!ret) + return -EINVAL; + ret = LZ4_decompress_safe_continue(zctx->dstrm, req->src, + req->dst, req->src_len, + req->dst_len); + } + if (ret < 0) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_lz4hc = { + .compress = lz4hc_compress, + .decompress = lz4hc_decompress, + .create_ctx = lz4hc_create, + .destroy_ctx = lz4hc_destroy, + .setup_params = lz4hc_setup_params, + .release_params = lz4hc_release_params, + .name = "lz4hc", +}; diff --git a/drivers/block/zram/backend_lz4hc.h b/drivers/block/zram/backend_lz4hc.h new file mode 100644 index 000000000000..6de03551ed4d --- /dev/null +++ b/drivers/block/zram/backend_lz4hc.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZ4HC_H__ +#define __BACKEND_LZ4HC_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lz4hc; + +#endif /* __BACKEND_LZ4HC_H__ */ diff --git a/drivers/block/zram/backend_lzo.c b/drivers/block/zram/backend_lzo.c new file mode 100644 index 000000000000..4c906beaae6b --- /dev/null +++ b/drivers/block/zram/backend_lzo.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/lzo.h> + +#include "backend_lzo.h" + +static void lzo_release_params(struct zcomp_params *params) +{ +} + +static int lzo_setup_params(struct zcomp_params *params) +{ + return 0; +} + +static int lzo_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static void lzo_destroy(struct zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int lzo_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_1_compress(req->src, req->src_len, req->dst, + &req->dst_len, ctx->context); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_decompress_safe(req->src, req->src_len, + req->dst, &req->dst_len); + return ret == LZO_E_OK ? 
0 : ret; +} + +const struct zcomp_ops backend_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create_ctx = lzo_create, + .destroy_ctx = lzo_destroy, + .setup_params = lzo_setup_params, + .release_params = lzo_release_params, + .name = "lzo", +}; diff --git a/drivers/block/zram/backend_lzo.h b/drivers/block/zram/backend_lzo.h new file mode 100644 index 000000000000..93d54749e63c --- /dev/null +++ b/drivers/block/zram/backend_lzo.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZO_H__ +#define __BACKEND_LZO_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lzo; + +#endif /* __BACKEND_LZO_H__ */ diff --git a/drivers/block/zram/backend_lzorle.c b/drivers/block/zram/backend_lzorle.c new file mode 100644 index 000000000000..10640c96cbfc --- /dev/null +++ b/drivers/block/zram/backend_lzorle.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/lzo.h> + +#include "backend_lzorle.h" + +static void lzorle_release_params(struct zcomp_params *params) +{ +} + +static int lzorle_setup_params(struct zcomp_params *params) +{ + return 0; +} + +static int lzorle_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + ctx->context = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!ctx->context) + return -ENOMEM; + return 0; +} + +static void lzorle_destroy(struct zcomp_ctx *ctx) +{ + kfree(ctx->context); +} + +static int lzorle_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzorle1x_1_compress(req->src, req->src_len, req->dst, + &req->dst_len, ctx->context); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzorle_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + int ret; + + ret = lzo1x_decompress_safe(req->src, req->src_len, + req->dst, &req->dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +const struct zcomp_ops backend_lzorle = { + .compress = lzorle_compress, + .decompress = lzorle_decompress, + .create_ctx = lzorle_create, + .destroy_ctx = lzorle_destroy, + .setup_params = lzorle_setup_params, + .release_params = lzorle_release_params, + .name = "lzo-rle", +}; diff --git a/drivers/block/zram/backend_lzorle.h b/drivers/block/zram/backend_lzorle.h new file mode 100644 index 000000000000..6ecb163b09f1 --- /dev/null +++ b/drivers/block/zram/backend_lzorle.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_LZORLE_H__ +#define __BACKEND_LZORLE_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_lzorle; + +#endif /* __BACKEND_LZORLE_H__ */ diff --git a/drivers/block/zram/backend_zstd.c b/drivers/block/zram/backend_zstd.c new file mode 100644 index 000000000000..1184c0036f44 --- /dev/null +++ b/drivers/block/zram/backend_zstd.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/zstd.h> + +#include "backend_zstd.h" + +struct zstd_ctx { + zstd_cctx *cctx; + zstd_dctx *dctx; + void *cctx_mem; + void *dctx_mem; +}; + +struct zstd_params { + zstd_custom_mem custom_mem; + zstd_cdict *cdict; + zstd_ddict *ddict; + zstd_parameters cprm; +}; + +/* + * For C/D dictionaries we need to provide zstd with zstd_custom_mem, + * which zstd uses internally to allocate/free memory when needed. 
+ * + * This means that allocator.customAlloc() can be called from zcomp_compress() + * under local-lock (per-CPU compression stream), in which case we must use + * GFP_ATOMIC. + * + * Another complication here is that we can be configured as a swap device. + */ +static void *zstd_custom_alloc(void *opaque, size_t size) +{ + if (!preemptible()) + return kvzalloc(size, GFP_ATOMIC); + + return kvzalloc(size, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN); +} + +static void zstd_custom_free(void *opaque, void *address) +{ + kvfree(address); +} + +static void zstd_release_params(struct zcomp_params *params) +{ + struct zstd_params *zp = params->drv_data; + + params->drv_data = NULL; + if (!zp) + return; + + zstd_free_cdict(zp->cdict); + zstd_free_ddict(zp->ddict); + kfree(zp); +} + +static int zstd_setup_params(struct zcomp_params *params) +{ + zstd_compression_parameters prm; + struct zstd_params *zp; + + zp = kzalloc(sizeof(*zp), GFP_KERNEL); + if (!zp) + return -ENOMEM; + + params->drv_data = zp; + if (params->level == ZCOMP_PARAM_NO_LEVEL) + params->level = zstd_default_clevel(); + + zp->cprm = zstd_get_params(params->level, PAGE_SIZE); + + zp->custom_mem.customAlloc = zstd_custom_alloc; + zp->custom_mem.customFree = zstd_custom_free; + + prm = zstd_get_cparams(params->level, PAGE_SIZE, + params->dict_sz); + + zp->cdict = zstd_create_cdict_byreference(params->dict, + params->dict_sz, + prm, + zp->custom_mem); + if (!zp->cdict) + goto error; + + zp->ddict = zstd_create_ddict_byreference(params->dict, + params->dict_sz, + zp->custom_mem); + if (!zp->ddict) + goto error; + + return 0; + +error: + zstd_release_params(params); + return -EINVAL; +} + +static void zstd_destroy(struct zcomp_ctx *ctx) +{ + struct zstd_ctx *zctx = ctx->context; + + if (!zctx) + return; + + /* + * If ->cctx_mem and ->dctx_mem were allocated then we didn't use + * C/D dictionary and ->cctx / ->dctx were "embedded" into these + * buffers. + * + * If otherwise then we need to explicitly release ->cctx / ->dctx. 
+ */ + if (zctx->cctx_mem) + vfree(zctx->cctx_mem); + else + zstd_free_cctx(zctx->cctx); + + if (zctx->dctx_mem) + vfree(zctx->dctx_mem); + else + zstd_free_dctx(zctx->dctx); + + kfree(zctx); +} + +static int zstd_create(struct zcomp_params *params, struct zcomp_ctx *ctx) +{ + struct zstd_ctx *zctx; + zstd_parameters prm; + size_t sz; + + zctx = kzalloc(sizeof(*zctx), GFP_KERNEL); + if (!zctx) + return -ENOMEM; + + ctx->context = zctx; + if (params->dict_sz == 0) { + prm = zstd_get_params(params->level, PAGE_SIZE); + sz = zstd_cctx_workspace_bound(&prm.cParams); + zctx->cctx_mem = vzalloc(sz); + if (!zctx->cctx_mem) + goto error; + + zctx->cctx = zstd_init_cctx(zctx->cctx_mem, sz); + if (!zctx->cctx) + goto error; + + sz = zstd_dctx_workspace_bound(); + zctx->dctx_mem = vzalloc(sz); + if (!zctx->dctx_mem) + goto error; + + zctx->dctx = zstd_init_dctx(zctx->dctx_mem, sz); + if (!zctx->dctx) + goto error; + } else { + struct zstd_params *zp = params->drv_data; + + zctx->cctx = zstd_create_cctx_advanced(zp->custom_mem); + if (!zctx->cctx) + goto error; + + zctx->dctx = zstd_create_dctx_advanced(zp->custom_mem); + if (!zctx->dctx) + goto error; + } + + return 0; + +error: + zstd_release_params(params); + zstd_destroy(ctx); + return -EINVAL; +} + +static int zstd_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct zstd_params *zp = params->drv_data; + struct zstd_ctx *zctx = ctx->context; + size_t ret; + + if (params->dict_sz == 0) + ret = zstd_compress_cctx(zctx->cctx, req->dst, req->dst_len, + req->src, req->src_len, &zp->cprm); + else + ret = zstd_compress_using_cdict(zctx->cctx, req->dst, + req->dst_len, req->src, + req->src_len, + zp->cdict); + if (zstd_is_error(ret)) + return -EINVAL; + req->dst_len = ret; + return 0; +} + +static int zstd_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req) +{ + struct zstd_params *zp = params->drv_data; + struct zstd_ctx *zctx = ctx->context; + size_t ret; + + if (params->dict_sz == 0) + ret = zstd_decompress_dctx(zctx->dctx, req->dst, req->dst_len, + req->src, req->src_len); + else + ret = zstd_decompress_using_ddict(zctx->dctx, req->dst, + req->dst_len, req->src, + req->src_len, zp->ddict); + if (zstd_is_error(ret)) + return -EINVAL; + return 0; +} + +const struct zcomp_ops backend_zstd = { + .compress = zstd_compress, + .decompress = zstd_decompress, + .create_ctx = zstd_create, + .destroy_ctx = zstd_destroy, + .setup_params = zstd_setup_params, + .release_params = zstd_release_params, + .name = "zstd", +}; diff --git a/drivers/block/zram/backend_zstd.h b/drivers/block/zram/backend_zstd.h new file mode 100644 index 000000000000..10fdfff1ec1c --- /dev/null +++ b/drivers/block/zram/backend_zstd.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef __BACKEND_ZSTD_H__ +#define __BACKEND_ZSTD_H__ + +#include "zcomp.h" + +extern const struct zcomp_ops backend_zstd; + +#endif /* __BACKEND_ZSTD_H__ */ diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 55af4efd7983..bb514403e305 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -1,7 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2014 Sergey Senozhatsky. 
- */ #include <linux/kernel.h> #include <linux/string.h> @@ -11,94 +8,101 @@ #include <linux/sched.h> #include <linux/cpu.h> #include <linux/crypto.h> +#include <linux/vmalloc.h> #include "zcomp.h" -static const char * const backends[] = { -#if IS_ENABLED(CONFIG_CRYPTO_LZO) - "lzo", - "lzo-rle", +#include "backend_lzo.h" +#include "backend_lzorle.h" +#include "backend_lz4.h" +#include "backend_lz4hc.h" +#include "backend_zstd.h" +#include "backend_deflate.h" +#include "backend_842.h" + +static const struct zcomp_ops *backends[] = { +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZO) + &backend_lzorle, + &backend_lzo, #endif -#if IS_ENABLED(CONFIG_CRYPTO_LZ4) - "lz4", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZ4) + &backend_lz4, #endif -#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) - "lz4hc", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_LZ4HC) + &backend_lz4hc, #endif -#if IS_ENABLED(CONFIG_CRYPTO_842) - "842", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_ZSTD) + &backend_zstd, #endif -#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) - "zstd", +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_DEFLATE) + &backend_deflate, #endif +#if IS_ENABLED(CONFIG_ZRAM_BACKEND_842) + &backend_842, +#endif + NULL }; -static void zcomp_strm_free(struct zcomp_strm *zstrm) +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) { - if (!IS_ERR_OR_NULL(zstrm->tfm)) - crypto_free_comp(zstrm->tfm); - free_pages((unsigned long)zstrm->buffer, 1); - zstrm->tfm = NULL; + comp->ops->destroy_ctx(&zstrm->ctx); + vfree(zstrm->buffer); zstrm->buffer = NULL; } -/* - * Initialize zcomp_strm structure with ->tfm initialized by backend, and - * ->buffer. Return a negative value on error. - */ -static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) +static int zcomp_strm_init(struct zcomp *comp, struct zcomp_strm *zstrm) { - zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); + int ret; + + ret = comp->ops->create_ctx(comp->params, &zstrm->ctx); + if (ret) + return ret; + /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { - zcomp_strm_free(zstrm); + zstrm->buffer = vzalloc(2 * PAGE_SIZE); + if (!zstrm->buffer) { + zcomp_strm_free(comp, zstrm); return -ENOMEM; } return 0; } +static const struct zcomp_ops *lookup_backend_ops(const char *comp) +{ + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + bool zcomp_available_algorithm(const char *comp) { - /* - * Crypto does not ignore a trailing new line symbol, - * so make sure you don't supply a string containing - * one. - * This also means that we permit zcomp initialisation - * with any compressing algorithm known to crypto api. 
- */ - return crypto_has_comp(comp, 0, 0) == 1; + return lookup_backend_ops(comp) != NULL; } /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { - bool known_algorithm = false; ssize_t sz = 0; int i; - for (i = 0; i < ARRAY_SIZE(backends); i++) { - if (!strcmp(comp, backends[i])) { - known_algorithm = true; + for (i = 0; i < ARRAY_SIZE(backends) - 1; i++) { + if (!strcmp(comp, backends[i]->name)) { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]); + "[%s] ", backends[i]->name); } else { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]); + "%s ", backends[i]->name); } } - /* - * Out-of-tree module known to crypto api or a missing - * entry in `backends'. - */ - if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", comp); - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } @@ -114,38 +118,34 @@ void zcomp_stream_put(struct zcomp *comp) local_unlock(&comp->stream->lock); } -int zcomp_compress(struct zcomp_strm *zstrm, - const void *src, unsigned int *dst_len) +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len) { - /* - * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized - * because sometimes we can endup having a bigger compressed data - * due to various reasons: for example compression algorithms tend - * to add some padding to the compressed buffer. Speaking of padding, - * comp algorithm `842' pads the compressed length to multiple of 8 - * and returns -ENOSP when the dst memory is not big enough, which - * is not something that ZRAM wants to see. We can handle the - * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we - * receive -ERRNO from the compressing backend we can't help it - * anymore. To make `842' happy we need to tell the exact size of - * the dst buffer, zram_drv will take care of the fact that - * compressed buffer is too big. 
- */ - *dst_len = PAGE_SIZE * 2; + struct zcomp_req req = { + .src = src, + .dst = zstrm->buffer, + .src_len = PAGE_SIZE, + .dst_len = 2 * PAGE_SIZE, + }; + int ret; - return crypto_comp_compress(zstrm->tfm, - src, PAGE_SIZE, - zstrm->buffer, dst_len); + ret = comp->ops->compress(comp->params, &zstrm->ctx, &req); + if (!ret) + *dst_len = req.dst_len; + return ret; } -int zcomp_decompress(struct zcomp_strm *zstrm, - const void *src, unsigned int src_len, void *dst) +int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst) { - unsigned int dst_len = PAGE_SIZE; - - return crypto_comp_decompress(zstrm->tfm, - src, src_len, - dst, &dst_len); + struct zcomp_req req = { + .src = src, + .dst = dst, + .src_len = src_len, + .dst_len = PAGE_SIZE, + }; + + return comp->ops->decompress(comp->params, &zstrm->ctx, &req); } int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) @@ -157,7 +157,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) zstrm = per_cpu_ptr(comp->stream, cpu); local_lock_init(&zstrm->lock); - ret = zcomp_strm_init(zstrm, comp); + ret = zcomp_strm_init(comp, zstrm); if (ret) pr_err("Can't allocate a compression stream\n"); return ret; @@ -169,11 +169,11 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) struct zcomp_strm *zstrm; zstrm = per_cpu_ptr(comp->stream, cpu); - zcomp_strm_free(zstrm); + zcomp_strm_free(comp, zstrm); return 0; } -static int zcomp_init(struct zcomp *comp) +static int zcomp_init(struct zcomp *comp, struct zcomp_params *params) { int ret; @@ -181,12 +181,19 @@ static int zcomp_init(struct zcomp *comp) if (!comp->stream) return -ENOMEM; + comp->params = params; + ret = comp->ops->setup_params(comp->params); + if (ret) + goto cleanup; + ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node); if (ret < 0) goto cleanup; + return 0; cleanup: + comp->ops->release_params(comp->params); free_percpu(comp->stream); return ret; } @@ -194,37 +201,35 @@ cleanup: void zcomp_destroy(struct zcomp *comp) { cpuhp_state_remove_instance(CPUHP_ZCOMP_PREPARE, &comp->node); + comp->ops->release_params(comp->params); free_percpu(comp->stream); kfree(comp); } -/* - * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return compressing - * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) - * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in - * case of allocation error, or any other error potentially - * returned by zcomp_init(). - */ -struct zcomp *zcomp_create(const char *alg) +struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params) { struct zcomp *comp; int error; /* - * Crypto API will execute /sbin/modprobe if the compression module - * is not loaded yet. We must do it here, otherwise we are about to - * call /sbin/modprobe under CPU hot-plug lock. + * The backends array has a sentinel NULL value, so the minimum + * size is 1. In order to be valid the array, apart from the + * sentinel NULL element, should have at least one compression + * backend selected. 
*/ - if (!zcomp_available_algorithm(alg)) - return ERR_PTR(-EINVAL); + BUILD_BUG_ON(ARRAY_SIZE(backends) <= 1); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->name = alg; - error = zcomp_init(comp); + comp->ops = lookup_backend_ops(alg); + if (!comp->ops) { + kfree(comp); + return ERR_PTR(-EINVAL); + } + + error = zcomp_init(comp, params); if (error) { kfree(comp); return ERR_PTR(error); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index cdefdef93da8..ad5762813842 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -1,24 +1,70 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - */ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ + #include <linux/local_lock.h> +#define ZCOMP_PARAM_NO_LEVEL INT_MIN + +/* + * Immutable driver (backend) parameters. The driver may attach private + * data to it (e.g. driver representation of the dictionary, etc.). + * + * This data is kept per-comp and is shared among execution contexts. + */ +struct zcomp_params { + void *dict; + size_t dict_sz; + s32 level; + + void *drv_data; +}; + +/* + * Run-time driver context - scratch buffers, etc. It is modified during + * request execution (compression/decompression), cannot be shared, so + * it's in per-CPU area. + */ +struct zcomp_ctx { + void *context; +}; + struct zcomp_strm { - /* The members ->buffer and ->tfm are protected by ->lock. */ local_lock_t lock; - /* compression/decompression buffer */ + /* compression buffer */ void *buffer; - struct crypto_comp *tfm; + struct zcomp_ctx ctx; +}; + +struct zcomp_req { + const unsigned char *src; + const size_t src_len; + + unsigned char *dst; + size_t dst_len; +}; + +struct zcomp_ops { + int (*compress)(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req); + int (*decompress)(struct zcomp_params *params, struct zcomp_ctx *ctx, + struct zcomp_req *req); + + int (*create_ctx)(struct zcomp_params *params, struct zcomp_ctx *ctx); + void (*destroy_ctx)(struct zcomp_ctx *ctx); + + int (*setup_params)(struct zcomp_params *params); + void (*release_params)(struct zcomp_params *params); + + const char *name; }; /* dynamic per-device compression frontend */ struct zcomp { struct zcomp_strm __percpu *stream; - const char *name; + const struct zcomp_ops *ops; + struct zcomp_params *params; struct hlist_node node; }; @@ -27,17 +73,15 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *alg); +struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); void zcomp_stream_put(struct zcomp *comp); -int zcomp_compress(struct zcomp_strm *zstrm, - const void *src, unsigned int *dst_len); - -int zcomp_decompress(struct zcomp_strm *zstrm, - const void *src, unsigned int src_len, void *dst); +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len); +int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst); -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa..9f5020b077c5 100644 --- a/drivers/block/zram/zram_drv.c +++ 
b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include <linux/debugfs.h> #include <linux/cpuhotplug.h> #include <linux/part_stat.h> +#include <linux/kernel_read_file.h> #include "zram_drv.h" @@ -54,22 +55,22 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; static void zram_free_page(struct zram *zram, size_t index); -static int zram_read_page(struct zram *zram, struct page *page, u32 index, - struct bio *parent); +static int zram_read_from_zspool(struct zram *zram, struct page *page, + u32 index); static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); + return spin_trylock(&zram->table[index].lock); } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); + spin_lock(&zram->table[index].lock); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); } static inline bool init_done(struct zram *zram) @@ -111,17 +112,6 @@ static void zram_clear_flag(struct zram *zram, u32 index, zram->table[index].flags &= ~BIT(flag); } -static inline void zram_set_element(struct zram *zram, u32 index, - unsigned long element) -{ - zram->table[index].element = element; -} - -static unsigned long zram_get_element(struct zram *zram, u32 index) -{ - return zram->table[index].element; -} - static size_t zram_get_obj_size(struct zram *zram, u32 index) { return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); @@ -142,6 +132,27 @@ static inline bool zram_allocated(struct zram *zram, u32 index) zram_test_flag(zram, index, ZRAM_WB); } +static inline void update_used_max(struct zram *zram, const unsigned long pages) +{ + unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + if (cur_max >= pages) + return; + } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, + &cur_max, pages)); +} + +static bool zram_can_store_page(struct zram *zram) +{ + unsigned long alloced_pages; + + alloced_pages = zs_get_total_pages(zram->mem_pool); + update_used_max(zram, alloced_pages); + + return !zram->limit_pages || alloced_pages <= zram->limit_pages; +} + #if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { @@ -177,23 +188,105 @@ static inline u32 zram_get_priority(struct zram *zram, u32 index) static void zram_accessed(struct zram *zram, u32 index) { zram_clear_flag(zram, index, ZRAM_IDLE); + zram_clear_flag(zram, index, ZRAM_PP_SLOT); #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME zram->table[index].ac_time = ktime_get_boottime(); #endif } -static inline void update_used_max(struct zram *zram, - const unsigned long pages) +#if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP +struct zram_pp_slot { + unsigned long index; + struct list_head entry; +}; + +/* + * A post-processing bucket is, essentially, a size class, this defines + * the range (in bytes) of pp-slots sizes in particular bucket. 
+ */ +#define PP_BUCKET_SIZE_RANGE 64 +#define NUM_PP_BUCKETS ((PAGE_SIZE / PP_BUCKET_SIZE_RANGE) + 1) + +struct zram_pp_ctl { + struct list_head pp_buckets[NUM_PP_BUCKETS]; +}; + +static struct zram_pp_ctl *init_pp_ctl(void) { - unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); + struct zram_pp_ctl *ctl; + u32 idx; - do { - if (cur_max >= pages) - return; - } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, - &cur_max, pages)); + ctl = kmalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return NULL; + + for (idx = 0; idx < NUM_PP_BUCKETS; idx++) + INIT_LIST_HEAD(&ctl->pp_buckets[idx]); + return ctl; +} + +static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps) +{ + list_del_init(&pps->entry); + + zram_slot_lock(zram, pps->index); + zram_clear_flag(zram, pps->index, ZRAM_PP_SLOT); + zram_slot_unlock(zram, pps->index); + + kfree(pps); +} + +static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl) +{ + u32 idx; + + if (!ctl) + return; + + for (idx = 0; idx < NUM_PP_BUCKETS; idx++) { + while (!list_empty(&ctl->pp_buckets[idx])) { + struct zram_pp_slot *pps; + + pps = list_first_entry(&ctl->pp_buckets[idx], + struct zram_pp_slot, + entry); + release_pp_slot(zram, pps); + } + } + + kfree(ctl); +} + +static void place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl, + struct zram_pp_slot *pps) +{ + u32 idx; + + idx = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; + list_add(&pps->entry, &ctl->pp_buckets[idx]); + + zram_set_flag(zram, pps->index, ZRAM_PP_SLOT); } +static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl) +{ + struct zram_pp_slot *pps = NULL; + s32 idx = NUM_PP_BUCKETS - 1; + + /* The higher the bucket id the more optimal slot post-processing is */ + while (idx >= 0) { + pps = list_first_entry_or_null(&ctl->pp_buckets[idx], + struct zram_pp_slot, + entry); + if (pps) + break; + + idx--; + } + return pps; +} +#endif + static inline void zram_fill_page(void *ptr, unsigned long len, unsigned long value) { @@ -295,19 +388,28 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) for (index = 0; index < nr_pages; index++) { /* - * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. - * See the comment in writeback_store. + * Do not mark ZRAM_SAME slots as ZRAM_IDLE, because no + * post-processing (recompress, writeback) happens to the + * ZRAM_SAME slot. + * + * And ZRAM_WB slots simply cannot be ZRAM_IDLE. 
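+ * (A ZRAM_WB slot no longer holds data in zsmalloc; its handle stores the backing-device block index, so there is nothing left to post-process.)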
*/ zram_slot_lock(zram, index); - if (zram_allocated(zram, index) && - !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { + if (!zram_allocated(zram, index) || + zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) { + zram_slot_unlock(zram, index); + continue; + } + #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - is_idle = !cutoff || ktime_after(cutoff, - zram->table[index].ac_time); + is_idle = !cutoff || + ktime_after(cutoff, zram->table[index].ac_time); #endif - if (is_idle) - zram_set_flag(zram, index, ZRAM_IDLE); - } + if (is_idle) + zram_set_flag(zram, index, ZRAM_IDLE); + else + zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); } } @@ -426,11 +528,10 @@ static void reset_bdev(struct zram *zram) if (!zram->backing_dev) return; - bdev_release(zram->bdev_handle); /* hope filp_close flush all of IO */ filp_close(zram->backing_dev, NULL); zram->backing_dev = NULL; - zram->bdev_handle = NULL; + zram->bdev = NULL; zram->disk->fops = &zram_devops; kvfree(zram->bitmap); zram->bitmap = NULL; @@ -473,10 +574,8 @@ static ssize_t backing_dev_store(struct device *dev, size_t sz; struct file *backing_dev = NULL; struct inode *inode; - struct address_space *mapping; unsigned int bitmap_sz; unsigned long nr_pages, *bitmap = NULL; - struct bdev_handle *bdev_handle = NULL; int err; struct zram *zram = dev_to_zram(dev); @@ -497,15 +596,14 @@ static ssize_t backing_dev_store(struct device *dev, if (sz > 0 && file_name[sz - 1] == '\n') file_name[sz - 1] = 0x00; - backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); + backing_dev = filp_open(file_name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(backing_dev)) { err = PTR_ERR(backing_dev); backing_dev = NULL; goto out; } - mapping = backing_dev->f_mapping; - inode = mapping->host; + inode = backing_dev->f_mapping->host; /* Support only block device in this moment */ if (!S_ISBLK(inode->i_mode)) { @@ -513,15 +611,13 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - bdev_handle = bdev_open_by_dev(inode->i_rdev, - BLK_OPEN_READ | BLK_OPEN_WRITE, zram, NULL); - if (IS_ERR(bdev_handle)) { - err = PTR_ERR(bdev_handle); - bdev_handle = NULL; + nr_pages = i_size_read(inode) >> PAGE_SHIFT; + /* Refuse to use zero sized device (also prevents self reference) */ + if (!nr_pages) { + err = -EINVAL; goto out; } - nr_pages = i_size_read(inode) >> PAGE_SHIFT; bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); if (!bitmap) { @@ -531,7 +627,7 @@ static ssize_t backing_dev_store(struct device *dev, reset_bdev(zram); - zram->bdev_handle = bdev_handle; + zram->bdev = I_BDEV(inode); zram->backing_dev = backing_dev; zram->bitmap = bitmap; zram->nr_pages = nr_pages; @@ -544,9 +640,6 @@ static ssize_t backing_dev_store(struct device *dev, out: kvfree(bitmap); - if (bdev_handle) - bdev_release(bdev_handle); - if (backing_dev) filp_close(backing_dev, NULL); @@ -587,7 +680,7 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, { struct bio *bio; - bio = bio_alloc(zram->bdev_handle->bdev, 1, parent->bi_opf, GFP_NOIO); + bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO); bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); __bio_add_page(bio, page, PAGE_SIZE, 0); bio_chain(bio, parent); @@ -601,11 +694,57 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, #define IDLE_WRITEBACK (1<<1) #define INCOMPRESSIBLE_WRITEBACK (1<<2) +static int scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long nr_pages, + unsigned 
long index, + struct zram_pp_ctl *ctl) +{ + struct zram_pp_slot *pps = NULL; + + for (; nr_pages != 0; index++, nr_pages--) { + if (!pps) + pps = kmalloc(sizeof(*pps), GFP_KERNEL); + if (!pps) + return -ENOMEM; + + INIT_LIST_HEAD(&pps->entry); + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) + goto next; + + if (mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + if (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + pps->index = index; + place_pp_slot(zram, ctl, pps); + pps = NULL; +next: + zram_slot_unlock(zram, index); + } + + kfree(pps); + return 0; +} + static ssize_t writeback_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct zram_pp_ctl *ctl = NULL; + struct zram_pp_slot *pps; unsigned long index = 0; struct bio bio; struct bio_vec bio_vec; @@ -640,6 +779,12 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } + /* Do not permit concurrent post-processing actions. */ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + if (!zram->backing_dev) { ret = -ENODEV; goto release_init_lock; @@ -651,7 +796,15 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - for (; nr_pages != 0; index++, nr_pages--) { + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, nr_pages, index, ctl); + + while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { spin_unlock(&zram->wb_limit_lock); @@ -668,42 +821,21 @@ static ssize_t writeback_store(struct device *dev, } } + index = pps->index; zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_UNDER_WB)) - goto next; - - if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; - if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - goto next; - /* - * Clearing ZRAM_UNDER_WB is duty of caller. - * IOW, zram_free_page never clear it. + * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so + * slots can change in the meantime. If slots are accessed or + * freed they lose ZRAM_PP_SLOT flag and hence we don't + * post-process them. 
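+ * (Both zram_accessed() and zram_free_page() clear ZRAM_PP_SLOT.)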
*/ - zram_set_flag(zram, index, ZRAM_UNDER_WB); - /* Need for hugepage writeback racing */ - zram_set_flag(zram, index, ZRAM_IDLE); + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) + goto next; + if (zram_read_from_zspool(zram, page, index)) + goto next; zram_slot_unlock(zram, index); - if (zram_read_page(zram, page, index, NULL)) { - zram_slot_lock(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_slot_unlock(zram, index); - continue; - } - bio_init(&bio, zram->bdev_handle->bdev, &bio_vec, 1, + bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); __bio_add_page(&bio, page, PAGE_SIZE, 0); @@ -714,10 +846,7 @@ static ssize_t writeback_store(struct device *dev, */ err = submit_bio_wait(&bio); if (err) { - zram_slot_lock(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_slot_unlock(zram, index); + release_pp_slot(zram, pps); /* * BIO errors are not fatal, we continue and simply * attempt to writeback the remaining objects (pages). @@ -731,27 +860,21 @@ static ssize_t writeback_store(struct device *dev, } atomic64_inc(&zram->stats.bd_writes); + zram_slot_lock(zram, index); /* - * We released zram_slot_lock so need to check if the slot was - * changed. If there is freeing for the slot, we can catch it - * easily by zram_allocated. - * A subtle case is the slot is freed/reallocated/marked as - * ZRAM_IDLE again. To close the race, idle_store doesn't - * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB. - * Thus, we could close the race by checking ZRAM_IDLE bit. + * Same as above, we release slot lock during writeback so + * slot can change under us: slot_free() or slot_free() and + * reallocation (zram_write_page()). In both cases slot loses + * ZRAM_PP_SLOT flag. No concurrent post-processing can set + * ZRAM_PP_SLOT on such slots until current post-processing + * finishes. 
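+ * (That exclusion is provided by the pp_in_progress flag taken at the start of writeback_store() and recompress_store().)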
*/ - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index) || - !zram_test_flag(zram, index, ZRAM_IDLE)) { - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - } zram_free_page(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_set_flag(zram, index, ZRAM_WB); - zram_set_element(zram, index, blk_idx); + zram_set_handle(zram, index, blk_idx); blk_idx = 0; atomic64_inc(&zram->stats.pages_stored); spin_lock(&zram->wb_limit_lock); @@ -760,12 +883,17 @@ static ssize_t writeback_store(struct device *dev, spin_unlock(&zram->wb_limit_lock); next: zram_slot_unlock(zram, index); + release_pp_slot(zram, pps); + + cond_resched(); } if (blk_idx) free_block_bdev(zram, blk_idx); __free_page(page); release_init_lock: + release_pp_ctl(zram, ctl); + atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); return ret; @@ -785,7 +913,7 @@ static void zram_sync_read(struct work_struct *work) struct bio_vec bv; struct bio bio; - bio_init(&bio, zw->zram->bdev_handle->bdev, &bv, 1, REQ_OP_READ); + bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9); __bio_add_page(&bio, zw->page, PAGE_SIZE, 0); zw->error = submit_bio_wait(&bio); @@ -1013,6 +1141,103 @@ static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) return 0; } +static void comp_params_reset(struct zram *zram, u32 prio) +{ + struct zcomp_params *params = &zram->params[prio]; + + vfree(params->dict); + params->level = ZCOMP_PARAM_NO_LEVEL; + params->dict_sz = 0; + params->dict = NULL; +} + +static int comp_params_store(struct zram *zram, u32 prio, s32 level, + const char *dict_path) +{ + ssize_t sz = 0; + + comp_params_reset(zram, prio); + + if (dict_path) { + sz = kernel_read_file_from_path(dict_path, 0, + &zram->params[prio].dict, + INT_MAX, + NULL, + READING_POLICY); + if (sz < 0) + return -EINVAL; + } + + zram->params[prio].dict_sz = sz; + zram->params[prio].level = level; + return 0; +} + +static ssize_t algorithm_params_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NO_LEVEL; + char *args, *param, *val, *algo = NULL, *dict_path = NULL; + struct zram *zram = dev_to_zram(dev); + int ret; + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "priority")) { + ret = kstrtoint(val, 10, &prio); + if (ret) + return ret; + continue; + } + + if (!strcmp(param, "level")) { + ret = kstrtoint(val, 10, &level); + if (ret) + return ret; + continue; + } + + if (!strcmp(param, "algo")) { + algo = val; + continue; + } + + if (!strcmp(param, "dict")) { + dict_path = val; + continue; + } + } + + /* Lookup priority by algorithm name */ + if (algo) { + s32 p; + + prio = -EINVAL; + for (p = ZRAM_PRIMARY_COMP; p < ZRAM_MAX_COMPS; p++) { + if (!zram->comp_algs[p]) + continue; + + if (!strcmp(zram->comp_algs[p], algo)) { + prio = p; + break; + } + } + } + + if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS) + return -EINVAL; + + ret = comp_params_store(zram, prio, level, dict_path); + return ret ? 
ret : len; +} + static ssize_t comp_algorithm_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1216,17 +1441,21 @@ static void zram_meta_free(struct zram *zram, u64 disksize) size_t num_pages = disksize >> PAGE_SHIFT; size_t index; + if (!zram->table) + return; + /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); + zram->table = NULL; } static bool zram_meta_alloc(struct zram *zram, u64 disksize) { - size_t num_pages; + size_t num_pages, index; num_pages = disksize >> PAGE_SHIFT; zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table))); @@ -1236,11 +1465,15 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) zram->mem_pool = zs_create_pool(zram->disk->disk_name); if (!zram->mem_pool) { vfree(zram->table); + zram->table = NULL; return false; } if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); return true; } @@ -1256,22 +1489,20 @@ static void zram_free_page(struct zram *zram, size_t index) #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME zram->table[index].ac_time = 0; #endif - if (zram_test_flag(zram, index, ZRAM_IDLE)) - zram_clear_flag(zram, index, ZRAM_IDLE); + + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); + zram_clear_flag(zram, index, ZRAM_PP_SLOT); + zram_set_priority(zram, index, 0); if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); } - if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); - - zram_set_priority(zram, index, 0); - if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); - free_block_bdev(zram, zram_get_element(zram, index)); + free_block_bdev(zram, zram_get_handle(zram, index)); goto out; } @@ -1292,64 +1523,80 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(zram->mem_pool, handle); atomic64_sub(zram_get_obj_size(zram, index), - &zram->stats.compr_data_size); + &zram->stats.compr_data_size); out: atomic64_dec(&zram->stats.pages_stored); zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); - WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); } -/* - * Reads (decompresses if needed) a page from zspool (zsmalloc). - * Corresponding ZRAM slot should be locked. 
- */ -static int zram_read_from_zspool(struct zram *zram, struct page *page, +static int read_same_filled_page(struct zram *zram, struct page *page, u32 index) { + void *mem; + + mem = kmap_local_page(page); + zram_fill_page(mem, PAGE_SIZE, zram_get_handle(zram, index)); + kunmap_local(mem); + return 0; +} + +static int read_incompressible_page(struct zram *zram, struct page *page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + dst = kmap_local_page(page); + copy_page(dst, src); + kunmap_local(dst); + zs_unmap_object(zram->mem_pool, handle); + + return 0; +} + +static int read_compressed_page(struct zram *zram, struct page *page, u32 index) +{ struct zcomp_strm *zstrm; unsigned long handle; unsigned int size; void *src, *dst; - u32 prio; - int ret; + int ret, prio; handle = zram_get_handle(zram, index); - if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { - unsigned long value; - void *mem; - - value = handle ? zram_get_element(zram, index) : 0; - mem = kmap_local_page(page); - zram_fill_page(mem, PAGE_SIZE, value); - kunmap_local(mem); - return 0; - } - size = zram_get_obj_size(zram, index); + prio = zram_get_priority(zram, index); - if (size != PAGE_SIZE) { - prio = zram_get_priority(zram, index); - zstrm = zcomp_stream_get(zram->comps[prio]); - } - + zstrm = zcomp_stream_get(zram->comps[prio]); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) { - dst = kmap_local_page(page); - memcpy(dst, src, PAGE_SIZE); - kunmap_local(dst); - ret = 0; - } else { - dst = kmap_local_page(page); - ret = zcomp_decompress(zstrm, src, size, dst); - kunmap_local(dst); - zcomp_stream_put(zram->comps[prio]); - } + dst = kmap_local_page(page); + ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); + kunmap_local(dst); zs_unmap_object(zram->mem_pool, handle); + zcomp_stream_put(zram->comps[prio]); + return ret; } +/* + * Reads (decompresses if needed) a page from zspool (zsmalloc). + * Corresponding ZRAM slot should be locked. 
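+ * Depending on the slot flags this dispatches to the same-filled, compressed or incompressible (huge) read helper.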
+ */ +static int zram_read_from_zspool(struct zram *zram, struct page *page, + u32 index) +{ + if (zram_test_flag(zram, index, ZRAM_SAME) || + !zram_get_handle(zram, index)) + return read_same_filled_page(zram, page, index); + + if (!zram_test_flag(zram, index, ZRAM_HUGE)) + return read_compressed_page(zram, page, index); + else + return read_incompressible_page(zram, page, index); +} + static int zram_read_page(struct zram *zram, struct page *page, u32 index, struct bio *parent) { @@ -1367,7 +1614,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, */ zram_slot_unlock(zram, index); - ret = read_from_bdev(zram, page, zram_get_element(zram, index), + ret = read_from_bdev(zram, page, zram_get_handle(zram, index), parent); } @@ -1405,32 +1652,88 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return zram_read_page(zram, bvec->bv_page, index, bio); } +static int write_same_filled_page(struct zram *zram, unsigned long fill, + u32 index) +{ + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_handle(zram, index, fill); + zram_slot_unlock(zram, index); + + atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + +static int write_incompressible_page(struct zram *zram, struct page *page, + u32 index) +{ + unsigned long handle; + void *src, *dst; + + /* + * This function is called from preemptible context so we don't need + * to do optimistic and fallback to pessimistic handle allocation, + * like we do for compressible pages. + */ + handle = zs_malloc(zram->mem_pool, PAGE_SIZE, + GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + if (IS_ERR_VALUE(handle)) + return PTR_ERR((void *)handle); + + if (!zram_can_store_page(zram)) { + zs_free(zram->mem_pool, handle); + return -ENOMEM; + } + + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + src = kmap_local_page(page); + memcpy(dst, src, PAGE_SIZE); + kunmap_local(src); + zs_unmap_object(zram->mem_pool, handle); + + zram_slot_lock(zram, index); + zram_set_flag(zram, index, ZRAM_HUGE); + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, PAGE_SIZE); + zram_slot_unlock(zram, index); + + atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.huge_pages); + atomic64_inc(&zram->stats.huge_pages_since); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + static int zram_write_page(struct zram *zram, struct page *page, u32 index) { int ret = 0; - unsigned long alloced_pages; unsigned long handle = -ENOMEM; unsigned int comp_len = 0; - void *src, *dst, *mem; + void *dst, *mem; struct zcomp_strm *zstrm; unsigned long element = 0; - enum zram_pageflags flags = 0; + bool same_filled; + + /* First, free memory allocated to this slot (if any) */ + zram_slot_lock(zram, index); + zram_free_page(zram, index); + zram_slot_unlock(zram, index); mem = kmap_local_page(page); - if (page_same_filled(mem, &element)) { - kunmap_local(mem); - /* Free memory associated with this sector now. 
*/ - flags = ZRAM_SAME; - atomic64_inc(&zram->stats.same_pages); - goto out; - } + same_filled = page_same_filled(mem, &element); kunmap_local(mem); + if (same_filled) + return write_same_filled_page(zram, element, index); compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); - src = kmap_local_page(page); - ret = zcomp_compress(zstrm, src, &comp_len); - kunmap_local(src); + mem = kmap_local_page(page); + ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm, + mem, &comp_len); + kunmap_local(mem); if (unlikely(ret)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); @@ -1439,8 +1742,11 @@ compress_again: return ret; } - if (comp_len >= huge_class_size) - comp_len = PAGE_SIZE; + if (comp_len >= huge_class_size) { + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + return write_incompressible_page(zram, page, index); + } + /* * handle allocation has 2 paths: * a) fast path is executed with preemption disabled (for @@ -1456,35 +1762,23 @@ compress_again: */ if (IS_ERR_VALUE(handle)) handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); - if (comp_len != PAGE_SIZE) - goto compress_again; - /* - * If the page is not compressible, you need to acquire the - * lock and execute the code below. The zcomp_stream_get() - * call is needed to disable the cpu hotplug and grab the - * zstrm buffer back. It is necessary that the dereferencing - * of the zstrm variable below occurs correctly. - */ - zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); + goto compress_again; } - alloced_pages = zs_get_total_pages(zram->mem_pool); - update_used_max(zram, alloced_pages); - - if (zram->limit_pages && alloced_pages > zram->limit_pages) { + if (!zram_can_store_page(zram)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_free(zram->mem_pool, handle); return -ENOMEM; @@ -1492,41 +1786,19 @@ compress_again: dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); - src = zstrm->buffer; - if (comp_len == PAGE_SIZE) - src = kmap_local_page(page); - memcpy(dst, src, comp_len); - if (comp_len == PAGE_SIZE) - kunmap_local(src); - + memcpy(dst, zstrm->buffer, comp_len); zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_unmap_object(zram->mem_pool, handle); - atomic64_add(comp_len, &zram->stats.compr_data_size); -out: - /* - * Free memory associated with this sector - * before overwriting unused sectors. 
- */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - if (comp_len == PAGE_SIZE) { - zram_set_flag(zram, index, ZRAM_HUGE); - atomic64_inc(&zram->stats.huge_pages); - atomic64_inc(&zram->stats.huge_pages_since); - } - - if (flags) { - zram_set_flag(zram, index, flags); - zram_set_element(zram, index, element); - } else { - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); - } + zram_slot_lock(zram, index); + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ atomic64_inc(&zram->stats.pages_stored); + atomic64_add(comp_len, &zram->stats.compr_data_size); + return ret; } @@ -1560,6 +1832,52 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } #ifdef CONFIG_ZRAM_MULTI_COMP +#define RECOMPRESS_IDLE (1 << 0) +#define RECOMPRESS_HUGE (1 << 1) + +static int scan_slots_for_recompress(struct zram *zram, u32 mode, + struct zram_pp_ctl *ctl) +{ + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct zram_pp_slot *pps = NULL; + unsigned long index; + + for (index = 0; index < nr_pages; index++) { + if (!pps) + pps = kmalloc(sizeof(*pps), GFP_KERNEL); + if (!pps) + return -ENOMEM; + + INIT_LIST_HEAD(&pps->entry); + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (mode & RECOMPRESS_IDLE && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + + if (mode & RECOMPRESS_HUGE && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || + zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + pps->index = index; + place_pp_slot(zram, ctl, pps); + pps = NULL; +next: + zram_slot_unlock(zram, index); + } + + kfree(pps); + return 0; +} + /* * This function will decompress (unless it's ZRAM_HUGE) the page and then * attempt to compress it using provided compression algorithm priority @@ -1567,8 +1885,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, * * Corresponding ZRAM slot should be locked. */ -static int zram_recompress(struct zram *zram, u32 index, struct page *page, - u32 threshold, u32 prio, u32 prio_max) +static int recompress_slot(struct zram *zram, u32 index, struct page *page, + u64 *num_recomp_pages, u32 threshold, u32 prio, + u32 prio_max) { struct zcomp_strm *zstrm = NULL; unsigned long handle_old; @@ -1596,6 +1915,13 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (ret) return ret; + /* + * We touched this entry so mark it as non-IDLE. This makes sure that + * we don't preserve IDLE flag and don't incorrectly pick this entry + * for different post-processing type (e.g. writeback). 
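+ * (Otherwise an idle-writeback pass started later could still see a stale ZRAM_IDLE flag on a slot we just recompressed.)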
+ */ + zram_clear_flag(zram, index, ZRAM_IDLE); + class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); /* * Iterate the secondary comp algorithms list (in order of priority) @@ -1615,7 +1941,8 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, num_recomps++; zstrm = zcomp_stream_get(zram->comps[prio]); src = kmap_local_page(page); - ret = zcomp_compress(zstrm, src, &comp_len_new); + ret = zcomp_compress(zram->comps[prio], zstrm, + src, &comp_len_new); kunmap_local(src); if (ret) { @@ -1645,6 +1972,15 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (!zstrm) return 0; + /* + * Decrement the limit (if set) on pages we can recompress, even + * when current recompression was unsuccessful or did not compress + * the page below the threshold, because we still spent resources + * on it. + */ + if (*num_recomp_pages) + *num_recomp_pages -= 1; + if (class_index_new >= class_index_old) { /* * Secondary algorithms failed to re-compress the page @@ -1699,19 +2035,17 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, return 0; } -#define RECOMPRESS_IDLE (1 << 0) -#define RECOMPRESS_HUGE (1 << 1) - static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); - unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; char *args, *param, *val, *algo = NULL; + u64 num_recomp_pages = ULLONG_MAX; + struct zram_pp_ctl *ctl = NULL; + struct zram_pp_slot *pps; u32 mode = 0, threshold = 0; - unsigned long index; struct page *page; ssize_t ret; @@ -1732,6 +2066,17 @@ static ssize_t recompress_store(struct device *dev, continue; } + if (!strcmp(param, "max_pages")) { + /* + * Limit the number of entries (pages) we attempt to + * recompress. + */ + ret = kstrtoull(val, 10, &num_recomp_pages); + if (ret) + return ret; + continue; + } + if (!strcmp(param, "threshold")) { /* * We will re-compress only idle objects equal or @@ -1747,6 +2092,18 @@ static ssize_t recompress_store(struct device *dev, algo = val; continue; } + + if (!strcmp(param, "priority")) { + ret = kstrtouint(val, 10, &prio); + if (ret) + return ret; + + if (prio == ZRAM_PRIMARY_COMP) + prio = ZRAM_SECONDARY_COMP; + + prio_max = min(prio + 1, ZRAM_MAX_COMPS); + continue; + } } if (threshold >= huge_class_size) @@ -1758,6 +2115,12 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } + /* Do not permit concurrent post-processing actions. 
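+ * (writeback_store() takes the same flag, so only one post-processing operation can run at a time; the loser returns -EAGAIN.)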
*/ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + if (algo) { bool found = false; @@ -1784,33 +2147,32 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } - ret = len; - for (index = 0; index < nr_pages; index++) { - int err = 0; - - zram_slot_lock(zram, index); + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } - if (!zram_allocated(zram, index)) - goto next; + scan_slots_for_recompress(zram, mode, ctl); - if (mode & RECOMPRESS_IDLE && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; + ret = len; + while ((pps = select_pp_slot(ctl))) { + int err = 0; - if (mode & RECOMPRESS_HUGE && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; + if (!num_recomp_pages) + break; - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_UNDER_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + zram_slot_lock(zram, pps->index); + if (!zram_test_flag(zram, pps->index, ZRAM_PP_SLOT)) goto next; - err = zram_recompress(zram, index, page, threshold, + err = recompress_slot(zram, pps->index, page, + &num_recomp_pages, threshold, prio, prio_max); next: - zram_slot_unlock(zram, index); + zram_slot_unlock(zram, pps->index); + release_pp_slot(zram, pps); + if (err) { ret = err; break; @@ -1822,6 +2184,8 @@ next: __free_page(page); release_init_lock: + release_pp_ctl(zram, ctl); + atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); return ret; } @@ -1966,11 +2330,20 @@ static void zram_slot_free_notify(struct block_device *bdev, zram_slot_unlock(zram, index); } +static void zram_comp_params_reset(struct zram *zram) +{ + u32 prio; + + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { + comp_params_reset(zram, prio); + } +} + static void zram_destroy_comps(struct zram *zram) { u32 prio; - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { struct zcomp *comp = zram->comps[prio]; zram->comps[prio] = NULL; @@ -1979,6 +2352,15 @@ static void zram_destroy_comps(struct zram *zram) zcomp_destroy(comp); zram->num_active_comps--; } + + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { + /* Do not free statically defined compression algorithms */ + if (zram->comp_algs[prio] != default_compressor) + kfree(zram->comp_algs[prio]); + zram->comp_algs[prio] = NULL; + } + + zram_comp_params_reset(zram); } static void zram_reset_device(struct zram *zram) @@ -1987,11 +2369,6 @@ static void zram_reset_device(struct zram *zram) zram->limit_pages = 0; - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); @@ -2000,6 +2377,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; zram_destroy_comps(zram); memset(&zram->stats, 0, sizeof(zram->stats)); + atomic_set(&zram->pp_in_progress, 0); reset_bdev(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); @@ -2032,11 +2410,12 @@ static ssize_t disksize_store(struct device *dev, goto out_unlock; } - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; - comp = zcomp_create(zram->comp_algs[prio]); + comp = zcomp_create(zram->comp_algs[prio], + &zram->params[prio]); if (IS_ERR(comp)) { pr_err("Cannot initialise %s compressing backend\n", zram->comp_algs[prio]); @@ 
-2139,6 +2518,7 @@ static DEVICE_ATTR_RW(writeback_limit_enable); static DEVICE_ATTR_RW(recomp_algorithm); static DEVICE_ATTR_WO(recompress); #endif +static DEVICE_ATTR_WO(algorithm_params); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -2166,6 +2546,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_recomp_algorithm.attr, &dev_attr_recompress.attr, #endif + &dev_attr_algorithm_params.attr, NULL, }; @@ -2177,6 +2558,30 @@ ATTRIBUTE_GROUPS(zram_disk); */ static int zram_add(void) { + struct queue_limits lim = { + .logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE, + /* + * To ensure that we always get PAGE_SIZE aligned and + * n*PAGE_SIZED sized I/O requests. + */ + .physical_block_size = PAGE_SIZE, + .io_min = PAGE_SIZE, + .io_opt = PAGE_SIZE, + .max_hw_discard_sectors = UINT_MAX, + /* + * zram_bio_discard() will clear all logical blocks if logical + * block size is identical with physical block size(PAGE_SIZE). + * But if it is different, we will skip discarding some parts of + * logical blocks in the part of the request range which isn't + * aligned to physical block size. So we can't ensure that all + * discarded logical blocks are zeroed. + */ +#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE + .max_write_zeroes_sectors = UINT_MAX, +#endif + .features = BLK_FEAT_STABLE_WRITES | + BLK_FEAT_SYNCHRONOUS, + }; struct zram *zram; int ret, device_id; @@ -2195,11 +2600,11 @@ static int zram_add(void) #endif /* gendisk structure */ - zram->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!zram->disk) { + zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(zram->disk)) { pr_err("Error allocating disk structure for device %d\n", device_id); - ret = -ENOMEM; + ret = PTR_ERR(zram->disk); goto out_free_idr; } @@ -2210,42 +2615,16 @@ static int zram_add(void) zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + atomic_set(&zram->pp_in_progress, 0); + zram_comp_params_reset(zram); + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */ set_capacity(zram->disk, 0); - /* zram devices sort of resembles non-rotational disks */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - - /* - * To ensure that we always get PAGE_SIZE aligned - * and n*PAGE_SIZED sized I/O requests. - */ - blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); - blk_queue_logical_block_size(zram->disk->queue, - ZRAM_LOGICAL_BLOCK_SIZE); - blk_queue_io_min(zram->disk->queue, PAGE_SIZE); - blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); - blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); - - /* - * zram_bio_discard() will clear all logical blocks if logical block - * size is identical with physical block size(PAGE_SIZE). But if it is - * different, we will skip discarding some parts of logical blocks in - * the part of the request range which isn't aligned to physical block - * size. So we can't ensure that all discarded logical blocks are - * zeroed. 
- */ - if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) - blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); - - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -2394,9 +2773,10 @@ static void destroy_devices(void) static int __init zram_init(void) { + struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 3b94d12f41b4..db78d7c01b9a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -45,11 +45,9 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { - /* zram slot is locked */ - ZRAM_LOCK = ZRAM_FLAG_SHIFT, - ZRAM_SAME, /* Page consists the same element */ + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ - ZRAM_UNDER_WB, /* page is under writeback */ + ZRAM_PP_SLOT, /* Selected for post-processing */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */ @@ -64,11 +62,9 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { - union { - unsigned long handle; - unsigned long element; - }; - unsigned long flags; + unsigned long handle; + unsigned int flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif @@ -107,6 +103,7 @@ struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; struct zcomp *comps[ZRAM_MAX_COMPS]; + struct zcomp_params params[ZRAM_MAX_COMPS]; struct gendisk *disk; /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; @@ -132,12 +129,13 @@ struct zram { spinlock_t wb_limit_lock; bool wb_limit_enable; u64 bd_wb_limit; - struct bdev_handle *bdev_handle; + struct block_device *bdev; unsigned long *bitmap; unsigned long nr_pages; #endif #ifdef CONFIG_ZRAM_MEMORY_TRACKING struct dentry *debugfs_dir; #endif + atomic_t pp_in_progress; }; #endif
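For readers unfamiliar with the reworked zcomp interface above, here is a minimal, hypothetical sketch of what a compression backend implementing struct zcomp_ops could look like. It is built only from the zcomp.h declarations in this diff; the "copy" backend, its function names and the assumption that it would be listed in the backends[] array consulted by lookup_backend_ops() are for illustration only and are not part of the patch.

#include <linux/errno.h>
#include <linux/string.h>

#include "zcomp.h"

/*
 * Hypothetical "copy" backend: stores data uncompressed. Only meant to
 * illustrate the zcomp_ops contract; not part of this series.
 */
static int copy_compress(struct zcomp_params *params, struct zcomp_ctx *ctx,
			 struct zcomp_req *req)
{
	/* A real backend writes compressed data and the resulting length. */
	if (req->dst_len < req->src_len)
		return -ENOSPC;
	memcpy(req->dst, req->src, req->src_len);
	req->dst_len = req->src_len;
	return 0;
}

static int copy_decompress(struct zcomp_params *params, struct zcomp_ctx *ctx,
			   struct zcomp_req *req)
{
	memcpy(req->dst, req->src, req->src_len);
	return 0;
}

static int copy_create_ctx(struct zcomp_params *params, struct zcomp_ctx *ctx)
{
	ctx->context = NULL;	/* no per-CPU scratch state needed */
	return 0;
}

static void copy_destroy_ctx(struct zcomp_ctx *ctx)
{
}

static int copy_setup_params(struct zcomp_params *params)
{
	/* no compression level or dictionary support */
	return 0;
}

static void copy_release_params(struct zcomp_params *params)
{
}

const struct zcomp_ops backend_copy = {
	.compress	= copy_compress,
	.decompress	= copy_decompress,
	.create_ctx	= copy_create_ctx,
	.destroy_ctx	= copy_destroy_ctx,
	.setup_params	= copy_setup_params,
	.release_params	= copy_release_params,
	.name		= "copy",
};

Per the header comments in the diff, per-CPU scratch state belongs in zcomp_ctx (created once per online CPU via create_ctx), while level and dictionary settings arrive through the shared, immutable zcomp_params that comp_params_store() fills from the algorithm_params sysfs attribute.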