Diffstat (limited to 'drivers/block/brd.c')
| -rw-r--r-- | drivers/block/brd.c | 585 |
1 file changed, 241 insertions(+), 344 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c18586fccb6f..9778259b30d4 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Ram backed block device driver.
  *
@@ -17,314 +18,209 @@
 #include <linux/bio.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
-#include <linux/radix-tree.h>
+#include <linux/pagemap.h>
+#include <linux/xarray.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
+#include <linux/debugfs.h>
 
 #include <linux/uaccess.h>
 
-#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
-#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
-
 /*
- * Each block ramdisk device has a radix_tree brd_pages of pages that stores
- * the pages containing the block device's contents. A brd page's ->index is
- * its offset in PAGE_SIZE units. This is similar to, but in no way connected
- * with, the kernel's pagecache or buffer cache (which sit above our block
- * device).
+ * Each block ramdisk device has a xarray brd_pages of pages that stores
+ * the pages containing the block device's contents.
  */
 struct brd_device {
-	int		brd_number;
-
-	struct request_queue	*brd_queue;
+	int			brd_number;
 	struct gendisk		*brd_disk;
 	struct list_head	brd_list;
 
 	/*
-	 * Backing store of pages and lock to protect it. This is the contents
-	 * of the block device.
+	 * Backing store of pages. This is the contents of the block device.
 	 */
-	spinlock_t		brd_lock;
-	struct radix_tree_root	brd_pages;
+	struct xarray		brd_pages;
+	u64			brd_nr_pages;
 };
 
 /*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page with reference grabbed for a given sector.
  */
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
-	pgoff_t idx;
 	struct page *page;
+	XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
 
-	/*
-	 * The page lifetime is protected by the fact that we have opened the
-	 * device node -- brd pages will never be deleted under us, so we
-	 * don't need any further locking or refcounting.
-	 *
-	 * This is strictly true for the radix-tree nodes as well (ie. we
-	 * don't actually need the rcu_read_lock()), however that is not a
-	 * documented feature of the radix-tree API so it is better to be
-	 * safe here (we don't have total exclusion from radix tree updates
-	 * here, only deletes).
-	 */
 	rcu_read_lock();
-	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
-	page = radix_tree_lookup(&brd->brd_pages, idx);
-	rcu_read_unlock();
+repeat:
+	page = xas_load(&xas);
+	if (xas_retry(&xas, page)) {
+		xas_reset(&xas);
+		goto repeat;
+	}
+
+	if (!page)
+		goto out;
 
-	BUG_ON(page && page->index != idx);
+	if (!get_page_unless_zero(page)) {
+		xas_reset(&xas);
+		goto repeat;
+	}
+
+	if (unlikely(page != xas_reload(&xas))) {
+		put_page(page);
+		xas_reset(&xas);
+		goto repeat;
+	}
+out:
+	rcu_read_unlock();
 
 	return page;
 }
 
 /*
- * Look up and return a brd's page for a given sector.
- * If one does not exist, allocate an empty page, and insert that. Then
- * return it.
+ * Insert a new page for a given sector, if one does not already exist.
+ * The returned page will grab reference.
  */
-static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
+		blk_opf_t opf)
 {
-	pgoff_t idx;
-	struct page *page;
-	gfp_t gfp_flags;
+	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
+	struct page *page, *ret;
 
-	page = brd_lookup_page(brd, sector);
-	if (page)
-		return page;
-
-	/*
-	 * Must use NOIO because we don't want to recurse back into the
-	 * block or filesystem layers from page reclaim.
-	 *
-	 * Cannot support DAX and highmem, because our ->direct_access
-	 * routine for DAX must return memory that is always addressable.
-	 * If DAX was reworked to use pfns and kmap throughout, this
-	 * restriction might be able to be lifted.
-	 */
-	gfp_flags = GFP_NOIO | __GFP_ZERO;
-	page = alloc_page(gfp_flags);
+	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
 	if (!page)
-		return NULL;
-
-	if (radix_tree_preload(GFP_NOIO)) {
-		__free_page(page);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
+
+	xa_lock(&brd->brd_pages);
+	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
+			page, gfp);
+	if (!ret) {
+		brd->brd_nr_pages++;
+		get_page(page);
+		xa_unlock(&brd->brd_pages);
+		return page;
 	}
 
-	spin_lock(&brd->brd_lock);
-	idx = sector >> PAGE_SECTORS_SHIFT;
-	page->index = idx;
-	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
-		__free_page(page);
-		page = radix_tree_lookup(&brd->brd_pages, idx);
-		BUG_ON(!page);
-		BUG_ON(page->index != idx);
+	if (!xa_is_err(ret)) {
+		get_page(ret);
+		xa_unlock(&brd->brd_pages);
+		put_page(page);
+		return ret;
 	}
-	spin_unlock(&brd->brd_lock);
 
-	radix_tree_preload_end();
-
-	return page;
+	xa_unlock(&brd->brd_pages);
+	put_page(page);
+	return ERR_PTR(xa_err(ret));
 }
 
 /*
- * Free all backing store pages and radix tree. This must only be called when
+ * Free all backing store pages and xarray. This must only be called when
  * there are no other users of the device.
  */
-#define FREE_BATCH 16
 static void brd_free_pages(struct brd_device *brd)
 {
-	unsigned long pos = 0;
-	struct page *pages[FREE_BATCH];
-	int nr_pages;
-
-	do {
-		int i;
-
-		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
-				(void **)pages, pos, FREE_BATCH);
-
-		for (i = 0; i < nr_pages; i++) {
-			void *ret;
-
-			BUG_ON(pages[i]->index < pos);
-			pos = pages[i]->index;
-			ret = radix_tree_delete(&brd->brd_pages, pos);
-			BUG_ON(!ret || ret != pages[i]);
-			__free_page(pages[i]);
-		}
-
-		pos++;
-
-		/*
-		 * This assumes radix_tree_gang_lookup always returns as
-		 * many pages as possible. If the radix-tree code changes,
-		 * so will this have to.
-		 */
-	} while (nr_pages == FREE_BATCH);
-}
+	struct page *page;
+	pgoff_t idx;
 
-/*
- * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
- */
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
-{
-	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
-	size_t copy;
-
-	copy = min_t(size_t, n, PAGE_SIZE - offset);
-	if (!brd_insert_page(brd, sector))
-		return -ENOSPC;
-	if (copy < n) {
-		sector += copy >> SECTOR_SHIFT;
-		if (!brd_insert_page(brd, sector))
-			return -ENOSPC;
+	xa_for_each(&brd->brd_pages, idx, page) {
+		put_page(page);
+		cond_resched();
 	}
-	return 0;
+
+	xa_destroy(&brd->brd_pages);
 }
 
 /*
- * Copy n bytes from src to the brd starting at sector. Does not sleep.
+ * Process a single segment. The segment is capped to not cross page boundaries
+ * in both the bio and the brd backing memory.
  */
-static void copy_to_brd(struct brd_device *brd, const void *src,
-			sector_t sector, size_t n)
+static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 {
+	struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+	sector_t sector = bio->bi_iter.bi_sector;
+	u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+	blk_opf_t opf = bio->bi_opf;
 	struct page *page;
-	void *dst;
-	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
-	size_t copy;
+	void *kaddr;
+
+	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
 
-	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	page = brd_lookup_page(brd, sector);
-	BUG_ON(!page);
-
-	dst = kmap_atomic(page);
-	memcpy(dst + offset, src, copy);
-	kunmap_atomic(dst);
-
-	if (copy < n) {
-		src += copy;
-		sector += copy >> SECTOR_SHIFT;
-		copy = n - copy;
-		page = brd_lookup_page(brd, sector);
-		BUG_ON(!page);
-
-		dst = kmap_atomic(page);
-		memcpy(dst, src, copy);
-		kunmap_atomic(dst);
+	if (!page && op_is_write(opf)) {
+		page = brd_insert_page(brd, sector, opf);
+		if (IS_ERR(page))
+			goto out_error;
 	}
+
+	kaddr = bvec_kmap_local(&bv);
+	if (op_is_write(opf)) {
+		memcpy_to_page(page, offset, kaddr, bv.bv_len);
+	} else {
+		if (page)
+			memcpy_from_page(kaddr, page, offset, bv.bv_len);
+		else
+			memset(kaddr, 0, bv.bv_len);
+	}
+	kunmap_local(kaddr);
+
+	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+	if (page)
+		put_page(page);
+	return true;
+
+out_error:
+	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
+		bio_wouldblock_error(bio);
+	else
+		bio_io_error(bio);
+	return false;
 }
 
-/*
- * Copy n bytes to dst from the brd starting at sector. Does not sleep.
- */
-static void copy_from_brd(void *dst, struct brd_device *brd,
-			sector_t sector, size_t n)
+static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
 {
+	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
+	sector_t aligned_end = round_down(
+			sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
 	struct page *page;
-	void *src;
-	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
-	size_t copy;
 
-	copy = min_t(size_t, n, PAGE_SIZE - offset);
-	page = brd_lookup_page(brd, sector);
-	if (page) {
-		src = kmap_atomic(page);
-		memcpy(dst, src + offset, copy);
-		kunmap_atomic(src);
-	} else
-		memset(dst, 0, copy);
-
-	if (copy < n) {
-		dst += copy;
-		sector += copy >> SECTOR_SHIFT;
-		copy = n - copy;
-		page = brd_lookup_page(brd, sector);
+	if (aligned_end <= aligned_sector)
+		return;
+
+	xa_lock(&brd->brd_pages);
+	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
+		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
 		if (page) {
-			src = kmap_atomic(page);
-			memcpy(dst, src, copy);
-			kunmap_atomic(src);
-		} else
-			memset(dst, 0, copy);
+			put_page(page);
+			brd->brd_nr_pages--;
+		}
+		aligned_sector += PAGE_SECTORS;
 	}
+	xa_unlock(&brd->brd_pages);
 }
 
-/*
- * Process a single bvec of a bio.
- */
-static int brd_do_bvec(struct brd_device *brd, struct page *page,
-			unsigned int len, unsigned int off, unsigned int op,
-			sector_t sector)
+static void brd_submit_bio(struct bio *bio)
 {
-	void *mem;
-	int err = 0;
-
-	if (op_is_write(op)) {
-		err = copy_to_brd_setup(brd, sector, len);
-		if (err)
-			goto out;
-	}
+	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
 
-	mem = kmap_atomic(page);
-	if (!op_is_write(op)) {
-		copy_from_brd(mem + off, brd, sector, len);
-		flush_dcache_page(page);
-	} else {
-		flush_dcache_page(page);
-		copy_to_brd(brd, mem + off, sector, len);
+	if (unlikely(op_is_discard(bio->bi_opf))) {
+		brd_do_discard(brd, bio->bi_iter.bi_sector,
+				bio->bi_iter.bi_size);
+		bio_endio(bio);
+		return;
 	}
-	kunmap_atomic(mem);
-out:
-	return err;
-}
-
-static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
-{
-	struct brd_device *brd = bio->bi_disk->private_data;
-	struct bio_vec bvec;
-	sector_t sector;
-	struct bvec_iter iter;
-
-	sector = bio->bi_iter.bi_sector;
-	if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
-		goto io_error;
-
-	bio_for_each_segment(bvec, bio, iter) {
-		unsigned int len = bvec.bv_len;
-		int err;
-
-		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
-				  bio_op(bio), sector);
-		if (err)
-			goto io_error;
-		sector += len >> SECTOR_SHIFT;
-	}
+	do {
+		if (!brd_rw_bvec(brd, bio))
+			return;
+	} while (bio->bi_iter.bi_size);
 
 	bio_endio(bio);
-	return BLK_QC_T_NONE;
-io_error:
-	bio_io_error(bio);
-	return BLK_QC_T_NONE;
-}
-
-static int brd_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, unsigned int op)
-{
-	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err;
-
-	if (PageTransHuge(page))
-		return -ENOTSUPP;
-	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
-	page_endio(page, op_is_write(op), err);
-	return err;
 }
 
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
-	.rw_page =		brd_rw_page,
+	.submit_bio =		brd_submit_bio,
 };
 
 /*
@@ -342,6 +238,7 @@ static int max_part = 1;
 module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
+MODULE_DESCRIPTION("Ram backed block device driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -362,116 +259,139 @@ __setup("ramdisk_size=", ramdisk_size);
  */
 static LIST_HEAD(brd_devices);
 static DEFINE_MUTEX(brd_devices_mutex);
+static struct dentry *brd_debugfs_dir;
 
-static struct brd_device *brd_alloc(int i)
+static struct brd_device *brd_find_or_alloc_device(int i)
 {
 	struct brd_device *brd;
-	struct gendisk *disk;
 
-	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
-	if (!brd)
-		goto out;
-	brd->brd_number		= i;
-	spin_lock_init(&brd->brd_lock);
-	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+	mutex_lock(&brd_devices_mutex);
+	list_for_each_entry(brd, &brd_devices, brd_list) {
+		if (brd->brd_number == i) {
+			mutex_unlock(&brd_devices_mutex);
+			return ERR_PTR(-EEXIST);
+		}
+	}
 
-	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
-	if (!brd->brd_queue)
-		goto out_free_dev;
+	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+	if (!brd) {
+		mutex_unlock(&brd_devices_mutex);
+		return ERR_PTR(-ENOMEM);
+	}
+	brd->brd_number = i;
+	list_add_tail(&brd->brd_list, &brd_devices);
+	mutex_unlock(&brd_devices_mutex);
+	return brd;
+}
 
-	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
+static void brd_free_device(struct brd_device *brd)
+{
+	mutex_lock(&brd_devices_mutex);
+	list_del(&brd->brd_list);
+	mutex_unlock(&brd_devices_mutex);
+	kfree(brd);
+}
 
-	/* This is so fdisk will align partitions on 4k, because of
-	 * direct_access API needing 4k alignment, returning a PFN
-	 * (This is only a problem on very small devices <= 4M,
-	 *  otherwise fdisk will align on 1M. Regardless this call
-	 *  is harmless)
-	 */
-	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-	disk = brd->brd_disk = alloc_disk(max_part);
-	if (!disk)
-		goto out_free_queue;
+static int brd_alloc(int i)
+{
+	struct brd_device *brd;
+	struct gendisk *disk;
+	char buf[DISK_NAME_LEN];
+	int err = -ENOMEM;
+	struct queue_limits lim = {
+		/*
+		 * This is so fdisk will align partitions on 4k, because of
+		 * direct_access API needing 4k alignment, returning a PFN
+		 * (This is only a problem on very small devices <= 4M,
+		 *  otherwise fdisk will align on 1M. Regardless this call
+		 *  is harmless)
+		 */
+		.physical_block_size	= PAGE_SIZE,
+		.max_hw_discard_sectors	= UINT_MAX,
+		.max_discard_segments	= 1,
+		.discard_granularity	= PAGE_SIZE,
+		.features		= BLK_FEAT_SYNCHRONOUS |
+					  BLK_FEAT_NOWAIT,
+	};
+
+	brd = brd_find_or_alloc_device(i);
+	if (IS_ERR(brd))
+		return PTR_ERR(brd);
+
+	xa_init(&brd->brd_pages);
+
+	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
+	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
+		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
+				&brd->brd_nr_pages);
+
+	disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_free_dev;
+	}
 	disk->major		= RAMDISK_MAJOR;
 	disk->first_minor	= i * max_part;
+	disk->minors		= max_part;
 	disk->fops		= &brd_fops;
 	disk->private_data	= brd;
-	disk->flags		= GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "ram%d", i);
+	strscpy(disk->disk_name, buf, DISK_NAME_LEN);
 	set_capacity(disk, rd_size * 2);
-	brd->brd_queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
+
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
 
-	/* Tell the block layer that this is not a rotational device */
-	blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, brd->brd_queue);
-
-	return brd;
+	return 0;
 
-out_free_queue:
-	blk_cleanup_queue(brd->brd_queue);
+out_cleanup_disk:
+	put_disk(disk);
 out_free_dev:
-	kfree(brd);
-out:
-	return NULL;
+	brd_free_device(brd);
+	return err;
 }
 
-static void brd_free(struct brd_device *brd)
+static void brd_probe(dev_t dev)
 {
-	put_disk(brd->brd_disk);
-	blk_cleanup_queue(brd->brd_queue);
-	brd_free_pages(brd);
-	kfree(brd);
+	brd_alloc(MINOR(dev) / max_part);
 }
 
-static struct brd_device *brd_init_one(int i, bool *new)
+static void brd_cleanup(void)
 {
-	struct brd_device *brd;
+	struct brd_device *brd, *next;
 
-	*new = false;
-	list_for_each_entry(brd, &brd_devices, brd_list) {
-		if (brd->brd_number == i)
-			goto out;
-	}
+	debugfs_remove_recursive(brd_debugfs_dir);
 
-	brd = brd_alloc(i);
-	if (brd) {
-		brd->brd_disk->queue = brd->brd_queue;
-		add_disk(brd->brd_disk);
-		list_add_tail(&brd->brd_list, &brd_devices);
+	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+		del_gendisk(brd->brd_disk);
+		put_disk(brd->brd_disk);
+		brd_free_pages(brd);
+		brd_free_device(brd);
 	}
-	*new = true;
-out:
-	return brd;
-}
-
-static void brd_del_one(struct brd_device *brd)
-{
-	list_del(&brd->brd_list);
-	del_gendisk(brd->brd_disk);
-	brd_free(brd);
 }
 
-static struct kobject *brd_probe(dev_t dev, int *part, void *data)
+static inline void brd_check_and_reset_par(void)
 {
-	struct brd_device *brd;
-	struct kobject *kobj;
-	bool new;
-
-	mutex_lock(&brd_devices_mutex);
-	brd = brd_init_one(MINOR(dev) / max_part, &new);
-	kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL;
-	mutex_unlock(&brd_devices_mutex);
+	if (unlikely(!max_part))
+		max_part = 1;
 
-	if (new)
-		*part = 0;
+	/*
+	 * make sure 'max_part' can be divided exactly by (1U << MINORBITS),
+	 * otherwise, it is possiable to get same dev_t when adding partitions.
+	 */
+	if ((1U << MINORBITS) % max_part != 0)
+		max_part = 1UL << fls(max_part);
 
-	return kobj;
+	if (max_part > DISK_MAX_PARTS) {
+		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
+			DISK_MAX_PARTS, DISK_MAX_PARTS);
+		max_part = DISK_MAX_PARTS;
+	}
 }
 
 static int __init brd_init(void)
 {
-	struct brd_device *brd, *next;
-	int i;
+	int err, i;
 
 	/*
 	 * brd module now has a feature to instantiate underlying device
@@ -488,56 +408,33 @@ static int __init brd_init(void)
 	 * dynamically.
 	 */
 
-	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
-		return -EIO;
-
-	if (unlikely(!max_part))
-		max_part = 1;
-
-	for (i = 0; i < rd_nr; i++) {
-		brd = brd_alloc(i);
-		if (!brd)
-			goto out_free;
-		list_add_tail(&brd->brd_list, &brd_devices);
-	}
+	brd_check_and_reset_par();
 
-	/* point of no return */
+	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
 
-	list_for_each_entry(brd, &brd_devices, brd_list) {
-		/*
-		 * associate with queue just before adding disk for
-		 * avoiding to mess up failure path
-		 */
-		brd->brd_disk->queue = brd->brd_queue;
-		add_disk(brd->brd_disk);
+	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
+		err = -EIO;
+		goto out_free;
 	}
 
-	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
-				  THIS_MODULE, brd_probe, NULL, NULL);
+	for (i = 0; i < rd_nr; i++)
+		brd_alloc(i);
 
 	pr_info("brd: module loaded\n");
 	return 0;
 
 out_free:
-	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
-		list_del(&brd->brd_list);
-		brd_free(brd);
-	}
-	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+	brd_cleanup();
 
 	pr_info("brd: module NOT loaded !!!\n");
-	return -ENOMEM;
+	return err;
 }
 
 static void __exit brd_exit(void)
 {
-	struct brd_device *brd, *next;
-
-	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
-		brd_del_one(brd);
-
-	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+	brd_cleanup();
 
 	pr_info("brd: module unloaded\n");
 }
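
For readers following the conversion above: the patch replaces brd's radix tree with an xarray as the per-device page store. The snippet below is a minimal, hypothetical stand-alone sketch (not part of the patch) of the bare xarray calls the new code builds on: insert-if-empty, lookup, iterate, and tear down. All demo_* names are invented for illustration; brd itself layers page refcounting and its brd_nr_pages accounting on top, as shown in brd_lookup_page()/brd_insert_page() in the diff.

// Hypothetical sketch, not part of the patch: the core xarray operations
// used by the new brd code, wrapped in a trivial kernel module. demo_* names
// are made up for illustration.
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_store);	/* index -> pointer map, like brd_pages */

static int __init demo_init(void)
{
	unsigned long idx;
	void *entry;
	int *val;

	val = kmalloc(sizeof(*val), GFP_KERNEL);
	if (!val)
		return -ENOMEM;
	*val = 42;

	/* Store only if the slot is still empty, as brd_insert_page() does. */
	entry = xa_cmpxchg(&demo_store, 5, NULL, val, GFP_KERNEL);
	if (xa_is_err(entry)) {
		kfree(val);
		return xa_err(entry);
	}

	/* Plain lookup; brd_lookup_page() adds RCU plus page refcounting. */
	entry = xa_load(&demo_store, 5);
	if (entry)
		pr_info("demo: index 5 -> %d\n", *(int *)entry);

	/* Iterate and erase, the pattern behind brd_free_pages()/brd_do_discard(). */
	xa_for_each(&demo_store, idx, entry) {
		xa_erase(&demo_store, idx);
		kfree(entry);
	}
	xa_destroy(&demo_store);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("xarray usage sketch");
MODULE_LICENSE("GPL");

In the patch itself, brd uses __xa_cmpxchg() under xa_lock() rather than plain xa_cmpxchg() so the brd_nr_pages debugfs counter is updated atomically with the insertion, and lookups pair RCU with get_page_unless_zero()/xas_reload() so a page cannot be freed out from under a reader by a concurrent discard.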
