Diffstat (limited to 'drivers/block/loop.c')
| -rw-r--r-- | drivers/block/loop.c | 2282 |
1 file changed, 1178 insertions(+), 1104 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cf5538942834..272bc608e528 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,64 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll <rkroll@exploits.org> 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe <axboe@suse.de>, Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. 
*/ - #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> -#include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> @@ -70,7 +23,6 @@ #include <linux/writeback.h> #include <linux/completion.h> #include <linux/highmem.h> -#include <linux/kthread.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> @@ -78,83 +30,142 @@ #include <linux/uio.h> #include <linux/ioprio.h> #include <linux/blk-cgroup.h> +#include <linux/sched/mm.h> +#include <linux/statfs.h> +#include <linux/uaccess.h> +#include <linux/blk-mq.h> +#include <linux/spinlock.h> +#include <uapi/linux/loop.h> + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; -#include "loop.h" +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file *lo_backing_file; + unsigned int lo_min_dio_size; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; -#include <linux/uaccess.h> +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; + +#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); +static DEFINE_MUTEX(loop_validate_mutex); -static int max_part; -static int part_shift; - -static int transfer_xor(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block) +/** + * loop_global_lock_killable() - take locks for safe loop_validate_file() test + * + * @lo: struct loop_device + * @global: true if @lo is about to bind another "struct loop_device", false otherwise + * + * Returns 0 on success, -EINTR otherwise. + * + * Since loop_validate_file() traverses on other "struct loop_device" if + * is_loop_device() is true, we need a global lock for serializing concurrent + * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. 
+ */ +static int loop_global_lock_killable(struct loop_device *lo, bool global) { - char *raw_buf = kmap_atomic(raw_page) + raw_off; - char *loop_buf = kmap_atomic(loop_page) + loop_off; - char *in, *out, *key; - int i, keysize; + int err; - if (cmd == READ) { - in = raw_buf; - out = loop_buf; - } else { - in = loop_buf; - out = raw_buf; + if (global) { + err = mutex_lock_killable(&loop_validate_mutex); + if (err) + return err; } - - key = lo->lo_encrypt_key; - keysize = lo->lo_encrypt_key_size; - for (i = 0; i < size; i++) - *out++ = *in++ ^ key[(i & 511) % keysize]; - - kunmap_atomic(loop_buf); - kunmap_atomic(raw_buf); - cond_resched(); - return 0; + err = mutex_lock_killable(&lo->lo_mutex); + if (err && global) + mutex_unlock(&loop_validate_mutex); + return err; } -static int xor_init(struct loop_device *lo, const struct loop_info64 *info) +/** + * loop_global_unlock() - release locks taken by loop_global_lock_killable() + * + * @lo: struct loop_device + * @global: true if @lo was about to bind another "struct loop_device", false otherwise + */ +static void loop_global_unlock(struct loop_device *lo, bool global) { - if (unlikely(info->lo_encrypt_key_size <= 0)) - return -EINVAL; - return 0; + mutex_unlock(&lo->lo_mutex); + if (global) + mutex_unlock(&loop_validate_mutex); } -static struct loop_func_table none_funcs = { - .number = LO_CRYPT_NONE, -}; - -static struct loop_func_table xor_funcs = { - .number = LO_CRYPT_XOR, - .transfer = transfer_xor, - .init = xor_init -}; - -/* xfer_funcs[0] is special - its release function is never called */ -static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { - &none_funcs, - &xor_funcs -}; +static int max_part; +static int part_shift; -static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) +static loff_t lo_calculate_size(struct loop_device *lo, struct file *file) { loff_t loopsize; + int ret; + + if (S_ISBLK(file_inode(file)->i_mode)) { + loopsize = i_size_read(file->f_mapping->host); + } else { + struct kstat stat; + + /* + * Get the accurate file size. This provides better results than + * cached inode data, particularly for network filesystems where + * metadata may be stale. + */ + ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0); + if (ret) + return 0; + + loopsize = stat.size; + } - /* Compute loopsize in bytes */ - loopsize = i_size_read(file->f_mapping->host); - if (offset > 0) - loopsize -= offset; + if (lo->lo_offset > 0) + loopsize -= lo->lo_offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; - - if (sizelimit > 0 && sizelimit < loopsize) - loopsize = sizelimit; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. 
@@ -162,289 +173,108 @@ static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) return loopsize >> 9; } -static loff_t get_loop_size(struct loop_device *lo, struct file *file) -{ - return get_size(lo->lo_offset, lo->lo_sizelimit, file); -} - -static void __loop_update_dio(struct loop_device *lo, bool dio) -{ - struct file *file = lo->lo_backing_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned short sb_bsize = 0; - unsigned dio_align = 0; - bool use_dio; - - if (inode->i_sb->s_bdev) { - sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - dio_align = sb_bsize - 1; - } - - /* - * We support direct I/O only if lo_offset is aligned with the - * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's and the loop - * needn't transform transfer. - * - * TODO: the above condition may be loosed in the future, and - * direct I/O may be switched runtime at that time because most - * of requests in sane applications should be PAGE_SIZE aligned - */ - if (dio) { - if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO && - !lo->transfer) - use_dio = true; - else - use_dio = false; - } else { - use_dio = false; - } - - if (lo->use_dio == use_dio) - return; - - /* flush dirty pages before changing direct IO */ - vfs_fsync(file, 0); - - /* - * The flag of LO_FLAGS_DIRECT_IO is handled similarly with - * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup - * will get updated by ioctl(LOOP_GET_STATUS) - */ - blk_mq_freeze_queue(lo->lo_queue); - lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); - lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } - blk_mq_unfreeze_queue(lo->lo_queue); -} - -static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) -{ - loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); - sector_t x = (sector_t)size; - struct block_device *bdev = lo->lo_device; - - if (unlikely((loff_t)x != size)) - return -EFBIG; - if (lo->lo_offset != offset) - lo->lo_offset = offset; - if (lo->lo_sizelimit != sizelimit) - lo->lo_sizelimit = sizelimit; - set_capacity(lo->lo_disk, x); - bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - return 0; -} - -static inline int -lo_do_transfer(struct loop_device *lo, int cmd, - struct page *rpage, unsigned roffs, - struct page *lpage, unsigned loffs, - int size, sector_t rblock) -{ - int ret; - - ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); - if (likely(!ret)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Transfer error at byte offset %llu, length %i.\n", - (unsigned long long)rblock << 9, size); - return ret; -} - -static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) -{ - struct iov_iter i; - ssize_t bw; - - iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); - - file_start_write(file); - bw = vfs_iter_write(file, &i, ppos, 0); - file_end_write(file); - - if (likely(bw == bvec->bv_len)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Write error at byte offset %llu, length %i.\n", - (unsigned long long)*ppos, bvec->bv_len); - if (bw >= 0) - bw = -EIO; - return bw; -} - -static int 
lo_write_simple(struct loop_device *lo, struct request *rq, - loff_t pos) +/* + * We support direct I/O only if lo_offset is aligned with the logical I/O size + * of backing device, and the logical block size of loop is bigger than that of + * the backing device. + */ +static bool lo_can_use_dio(struct loop_device *lo) { - struct bio_vec bvec; - struct req_iterator iter; - int ret = 0; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); - if (ret < 0) - break; - cond_resched(); - } - - return ret; + if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) + return false; + if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) + return false; + if (lo->lo_offset & (lo->lo_min_dio_size - 1)) + return false; + return true; } /* - * This is the slow, transforming version that needs to double buffer the - * data as it cannot do the transformations in place without having direct - * access to the destination pages of the backing file. + * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by + * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently + * disabled when the device block size is too small or the offset is unaligned. + * + * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and + * not the originally passed in one. */ -static int lo_write_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) +static inline void loop_update_dio(struct loop_device *lo) { - struct bio_vec bvec, b; - struct req_iterator iter; - struct page *page; - int ret = 0; - - page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, - bvec.bv_offset, bvec.bv_len, pos >> 9); - if (unlikely(ret)) - break; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); - if (ret < 0) - break; - } + lockdep_assert_held(&lo->lo_mutex); + WARN_ON_ONCE(lo->lo_state == Lo_bound && + lo->lo_queue->mq_freeze_depth == 0); - __free_page(page); - return ret; + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; } -static int lo_read_simple(struct loop_device *lo, struct request *rq, - loff_t pos) +/** + * loop_set_size() - sets device size and notifies userspace + * @lo: struct loop_device to set the size for + * @size: new size of the loop device + * + * Callers must validate that the size passed into this function fits into + * a sector_t, eg using loop_validate_size() + */ +static void loop_set_size(struct loop_device *lo, loff_t size) { - struct bio_vec bvec; - struct req_iterator iter; - struct iov_iter i; - ssize_t len; - - rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) - return len; - - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - cond_resched(); - } - - return 0; + if (!set_capacity_and_notify(lo->lo_disk, size)) + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } -static int lo_read_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) +static void loop_clear_limits(struct loop_device *lo, int mode) { - struct bio_vec bvec, b; - struct req_iterator iter; - struct iov_iter i; - struct page *page; - ssize_t len; - int ret = 0; - - 
page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - loff_t offset = pos; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - - iov_iter_bvec(&i, READ, &b, 1, b.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) { - ret = len; - goto out_free_page; - } - - ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, - bvec.bv_offset, len, offset >> 9); - if (ret) - goto out_free_page; + struct queue_limits lim = queue_limits_start_update(lo->lo_queue); - flush_dcache_page(bvec.bv_page); + if (mode & FALLOC_FL_ZERO_RANGE) + lim.max_write_zeroes_sectors = 0; - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } + if (mode & FALLOC_FL_PUNCH_HOLE) { + lim.max_hw_discard_sectors = 0; + lim.discard_granularity = 0; } - ret = 0; -out_free_page: - __free_page(page); - return ret; + /* + * XXX: this updates the queue limits without freezing the queue, which + * is against the locking protocol and dangerous. But we can't just + * freeze the queue as we're inside the ->queue_rq method here. So this + * should move out into a workqueue unless we get the file operations to + * advertise if they support specific fallocate operations. + */ + queue_limits_commit_update(lo->lo_queue, &lim); } -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, + int mode) { /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * We use fallocate to manipulate the space mappings used by the image + * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; - if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } + mode |= FALLOC_FL_KEEP_SIZE; + + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; + + /* + * We initially configure the limits in a hope that fallocate is + * supported and clear them here if that turns out not to be true. 
+ */ + if (unlikely(ret == -EOPNOTSUPP)) + loop_clear_limits(lo, mode); + return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; @@ -456,10 +286,10 @@ static void lo_complete_rq(struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; - if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || + if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) - ret = BLK_STS_IOERR; + ret = errno_to_blk_status(cmd->ret); goto end_io; } @@ -472,14 +302,13 @@ static void lo_complete_rq(struct request *rq) cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { - if (cmd->use_aio) { - struct bio *bio = rq->bio; + struct bio *bio = rq->bio; - while (bio) { - zero_fill_bio(bio); - bio = bio->bi_next; - } + while (bio) { + zero_fill_bio(bio); + bio = bio->bi_next; } + ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); @@ -494,38 +323,39 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) return; kfree(cmd->bvec); cmd->bvec = NULL; - blk_mq_complete_request(rq); + if (req_op(rq) == REQ_OP_WRITE) + kiocb_end_write(&cmd->iocb); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } -static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +static void lo_rw_aio_complete(struct kiocb *iocb, long ret) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); - if (cmd->css) - css_put(cmd->css); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, bool rw) + loff_t pos, int rw) { struct iov_iter iter; + struct req_iterator rq_iter; struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; + struct bio_vec tmp; unsigned int offset; - int segments = 0; + unsigned int nr_bvec; int ret; + nr_bvec = blk_rq_nr_bvec(rq); + if (rq->bio != rq->biotail) { - struct req_iterator iter; - struct bio_vec tmp; - __rq_for_each_bio(bio, rq) - segments += bio_segments(bio); - bvec = kmalloc_array(segments, sizeof(struct bio_vec), + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO); if (!bvec) return -EIO; @@ -534,10 +364,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly - * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment + * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec * API will take care of all details for us. 
*/ - rq_for_each_segment(tmp, rq, iter) { + rq_for_each_bvec(tmp, rq, rq_iter) { *bvec = tmp; bvec++; } @@ -551,32 +381,34 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, */ offset = bio->bi_iter.bi_bvec_done; bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - segments = bio_segments(bio); } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq)); + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; - cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); - if (cmd->css) - kthread_associate_blkcg(cmd->css); - - if (rw == WRITE) - ret = call_write_iter(file, &cmd->iocb, &iter); - else - ret = call_read_iter(file, &cmd->iocb, &iter); + cmd->iocb.ki_ioprio = req_get_ioprio(rq); + if (cmd->use_aio) { + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + } else { + cmd->iocb.ki_complete = NULL; + cmd->iocb.ki_flags = 0; + } + + if (rw == ITER_SOURCE) { + kiocb_start_write(&cmd->iocb); + ret = file->f_op->write_iter(&cmd->iocb, &iter); + } else + ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); - kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) - cmd->iocb.ki_complete(&cmd->iocb, ret, 0); - return 0; + lo_rw_aio_complete(&cmd->iocb, ret); + return -EIOCBQUEUED; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) @@ -584,63 +416,69 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - /* - * lo_write_simple and lo_read_simple should have been covered - * by io submit style function like lo_rw_aio(), one blocker - * is that lo_read_simple() need to call flush_dcache_page after - * the page is written from kernel, and it isn't easy to handle - * this in io submit style function which submits all segments - * of the req at one time. And direct read IO doesn't need to - * run flush_dcache_page(). - */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); - case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - return lo_discard(lo, rq, pos); + /* + * If the caller doesn't want deallocation, call zeroout to + * write zeroes the range. Otherwise, punch them out. + */ + return lo_fallocate(lo, rq, pos, + (rq->cmd_flags & REQ_NOUNMAP) ? 
+ FALLOC_FL_ZERO_RANGE : + FALLOC_FL_PUNCH_HOLE); + case REQ_OP_DISCARD: + return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: - if (lo->transfer) - return lo_write_transfer(lo, rq, pos); - else if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, WRITE); - else - return lo_write_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); case REQ_OP_READ: - if (lo->transfer) - return lo_read_transfer(lo, rq, pos); - else if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, READ); - else - return lo_read_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_DEST); default: WARN_ON_ONCE(1); return -EIO; } } -static inline void loop_update_dio(struct loop_device *lo) -{ - __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | - lo->use_dio); -} - -static void loop_reread_partitions(struct loop_device *lo, - struct block_device *bdev) +static void loop_reread_partitions(struct loop_device *lo) { int rc; - rc = blkdev_reread_part(bdev); + mutex_lock(&lo->lo_disk->open_mutex); + rc = bdev_disk_changed(lo->lo_disk, false); + mutex_unlock(&lo->lo_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); } +static unsigned int loop_query_min_dio_size(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * Use the minimal dio alignment of the file system if provided. + */ + if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + return st.dio_offset_align; + + /* + * In a perfect world this wouldn't be needed, but as of Linux 6.13 only + * a handful of file systems support the STATX_DIOALIGN flag. + */ + if (sb_bdev) + return bdev_logical_block_size(sb_bdev); + return SECTOR_SIZE; +} + static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; - return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; + return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR; } static int loop_validate_file(struct file *file, struct block_device *bdev) @@ -652,13 +490,15 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) while (is_loop_device(f)) { struct loop_device *l; - if (f->f_mapping->host->i_bdev == bdev) + lockdep_assert_held(&loop_validate_mutex); + if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; - l = f->f_mapping->host->i_bdev->bd_disk->private_data; - if (l->lo_state == Lo_unbound) { + l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; + if (l->lo_state != Lo_bound) return -EINVAL; - } + /* Order wrt setting lo->lo_backing_file in loop_configure(). 
*/ + rmb(); f = l->lo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) @@ -666,6 +506,28 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return 0; } +static void loop_assign_backing_file(struct loop_device *lo, struct file *file) +{ + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + if (lo->lo_backing_file->f_flags & O_DIRECT) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + lo->lo_min_dio_size = loop_query_min_dio_size(lo); +} + +static int loop_check_backing_file(struct file *file) +{ + if (!file->f_op->read_iter) + return -EINVAL; + + if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter) + return -EINVAL; + + return 0; +} + /* * loop_change_fd switches the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -677,13 +539,29 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { - struct file *file = NULL, *old_file; - int error; - bool partscan; + struct file *file = fget(arg); + struct file *old_file; + unsigned int memflags; + int error; + bool partscan; + bool is_loop; - error = mutex_lock_killable(&loop_ctl_mutex); - if (error) + if (!file) + return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); return error; + } + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + + is_loop = is_loop_device(file); + error = loop_global_lock_killable(lo, is_loop); + if (error) + goto out_putf; error = -ENXIO; if (lo->lo_state != Lo_bound) goto out_err; @@ -693,11 +571,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out_err; - error = -EBADF; - file = fget(arg); - if (!file) - goto out_err; - error = loop_validate_file(file, bdev); if (error) goto out_err; @@ -707,35 +580,55 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, error = -EINVAL; /* size of the new backing store needs to be the same */ - if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) + if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file)) goto out_err; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data in the page cache now so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + /* and ... switch */ - blk_mq_freeze_queue(lo->lo_queue); + disk_force_media_change(lo->lo_disk); + memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, - lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); loop_update_dio(lo); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; - mutex_unlock(&loop_ctl_mutex); + loop_global_unlock(lo, is_loop); + /* - * We must drop file reference outside of loop_ctl_mutex as dropping - * the file ref can take bd_mutex which creates circular locking + * Flush loop_validate_file() before fput(), for l->lo_backing_file + * might be pointing at old_file which might be the last reference.
+ */ + if (!is_loop) { + mutex_lock(&loop_validate_mutex); + mutex_unlock(&loop_validate_mutex); + } + /* + * We must drop file reference outside of lo_mutex as dropping + * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); if (partscan) - loop_reread_partitions(lo, bdev); - return 0; + loop_reread_partitions(lo); -out_err: - mutex_unlock(&loop_ctl_mutex); - if (file) - fput(file); + error = 0; +done: + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); return error; + +out_err: + loop_global_unlock(lo, is_loop); +out_putf: + fput(file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + goto done; } /* loop sysfs attributes */ @@ -783,33 +676,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); @@ -847,81 +740,283 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo) +static void loop_get_discard_config(struct loop_device *lo, + u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - struct request_queue *q = lo->lo_queue; + struct kstatfs sbuf; + + /* + * If the backing device is a block device, mirror its zeroing + * capability. Set the discard sectors to the block device's zeroing + * capabilities because loop discards result in blkdev_issue_zeroout(), + * not blkdev_issue_discard(). This maintains consistent behavior with + * file-backed loop devices: discarded regions read back as zero. + */ + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); + + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); + *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * image a.k.a. discard. 
*/ - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - q->limits.discard_granularity = 0; - q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); - return; + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } +} + +struct loop_worker { + struct rb_node rb_node; + struct work_struct work; + struct list_head cmd_list; + struct list_head idle_list; + struct loop_device *lo; + struct cgroup_subsys_state *blkcg_css; + unsigned long last_ran_at; +}; - q->limits.discard_granularity = inode->i_sb->s_blocksize; - q->limits.discard_alignment = 0; +static void loop_workfn(struct work_struct *work); - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); +#ifdef CONFIG_BLK_CGROUP +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) +{ + return !css || css == blkcg_root_css; } +#else +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) +{ + return !css; +} +#endif -static void loop_unprepare_queue(struct loop_device *lo) +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - kthread_flush_worker(&lo->worker); - kthread_stop(lo->worker_task); + struct rb_node **node, *parent = NULL; + struct loop_worker *cur_worker, *worker = NULL; + struct work_struct *work; + struct list_head *cmd_list; + + spin_lock_irq(&lo->lo_work_lock); + + if (queue_on_root_worker(cmd->blkcg_css)) + goto queue_work; + + node = &lo->worker_tree.rb_node; + + while (*node) { + parent = *node; + cur_worker = container_of(*node, struct loop_worker, rb_node); + if (cur_worker->blkcg_css == cmd->blkcg_css) { + worker = cur_worker; + break; + } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { + node = &(*node)->rb_left; + } else { + node = &(*node)->rb_right; + } + } + if (worker) + goto queue_work; + + worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT); + /* + * In the event we cannot allocate a worker, just queue on the + * rootcg worker and issue the I/O as the rootcg + */ + if (!worker) { + cmd->blkcg_css = NULL; + if (cmd->memcg_css) + css_put(cmd->memcg_css); + cmd->memcg_css = NULL; + goto queue_work; + } + + worker->blkcg_css = cmd->blkcg_css; + css_get(worker->blkcg_css); + INIT_WORK(&worker->work, loop_workfn); + INIT_LIST_HEAD(&worker->cmd_list); + INIT_LIST_HEAD(&worker->idle_list); + worker->lo = lo; + rb_link_node(&worker->rb_node, parent, node); + rb_insert_color(&worker->rb_node, &lo->worker_tree); +queue_work: + if (worker) { + /* + * We need to remove from the idle list here while + * holding the lock so that the idle timer doesn't + * free the worker + */ + if (!list_empty(&worker->idle_list)) + list_del_init(&worker->idle_list); + work = &worker->work; + cmd_list = &worker->cmd_list; + } else { + work = &lo->rootcg_work; + cmd_list = &lo->rootcg_cmd_list; + } + list_add_tail(&cmd->list_entry, cmd_list); + queue_work(lo->workqueue, work); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { 
+ if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); } -static int loop_kthread_worker_fn(void *worker_ptr) +static void loop_free_idle_workers_timer(struct timer_list *timer) { - current->flags |= PF_LESS_THROTTLE; - return kthread_worker_fn(worker_ptr); + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); } -static int loop_prepare_queue(struct loop_device *lo) +/** + * loop_set_status_from_info - configure device from loop_info + * @lo: struct loop_device to configure + * @info: struct loop_info64 to configure the device with + * + * Configures the loop device parameters according to the passed + * in loop_info64 configuration. + */ +static int +loop_set_status_from_info(struct loop_device *lo, + const struct loop_info64 *info) { - kthread_init_worker(&lo->worker); - lo->worker_task = kthread_run(loop_kthread_worker_fn, - &lo->worker, "loop%d", lo->lo_number); - if (IS_ERR(lo->worker_task)) - return -ENOMEM; - set_user_nice(lo->worker_task, MIN_NICE); + if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) + return -EINVAL; + + switch (info->lo_encrypt_type) { + case LO_CRYPT_NONE: + break; + case LO_CRYPT_XOR: + pr_warn("support for the xor transformation has been removed.\n"); + return -EINVAL; + case LO_CRYPT_CRYPTOAPI: + pr_warn("support for cryptoloop has been removed. Use dm-crypt instead.\n"); + return -EINVAL; + default: + return -EINVAL; + } + + /* Avoid assigning overflow values */ + if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) + return -EOVERFLOW; + + lo->lo_offset = info->lo_offset; + lo->lo_sizelimit = info->lo_sizelimit; + + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); + lo->lo_file_name[LO_NAME_SIZE-1] = 0; return 0; } -static int loop_set_fd(struct loop_device *lo, fmode_t mode, - struct block_device *bdev, unsigned int arg) +static unsigned int loop_default_blocksize(struct loop_device *lo) +{ + /* In case of direct I/O, match underlying minimum I/O size */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + return lo->lo_min_dio_size; + return SECTOR_SIZE; +} + +static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, + unsigned int bsize) { - struct file *file; - struct inode *inode; - struct address_space *mapping; - int lo_flags = 0; - int error; - loff_t size; - bool partscan; + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; + u32 granularity = 0, max_discard_sectors = 0; + + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + + if (!bsize) + bsize = loop_default_blocksize(lo); + + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + + lim->logical_block_size = bsize; + lim->physical_block_size = bsize; + lim->io_min = bsize; + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim->features |= BLK_FEAT_WRITE_CACHE; + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim->features |= BLK_FEAT_ROTATIONAL; + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = 
max_discard_sectors; + if (max_discard_sectors) + lim->discard_granularity = granularity; + else + lim->discard_granularity = 0; +} + +static int loop_configure(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, + const struct loop_config *config) +{ + struct file *file = fget(config->fd); + struct queue_limits lim; + int error; + loff_t size; + bool partscan; + bool is_loop; + + if (!file) + return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); + return error; + } + + is_loop = is_loop_device(file); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - error = -EBADF; - file = fget(arg); - if (!file) - goto out; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. + */ + if (!(mode & BLK_OPEN_EXCL)) { + error = bd_prepare_to_claim(bdev, loop_configure, NULL); + if (error) + goto out_putf; + } - error = mutex_lock_killable(&loop_ctl_mutex); + error = loop_global_lock_killable(lo, is_loop); if (error) - goto out_putf; + goto out_bdev; error = -EBUSY; if (lo->lo_state != Lo_unbound) @@ -931,355 +1026,255 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (error) goto out_unlock; - mapping = file->f_mapping; - inode = mapping->host; - - if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || - !file->f_op->write_iter) - lo_flags |= LO_FLAGS_READ_ONLY; - - error = -EFBIG; - size = get_loop_size(lo, file); - if ((loff_t)(sector_t)size != size) + if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { + error = -EINVAL; goto out_unlock; - error = loop_prepare_queue(lo); + } + + error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; + lo->lo_flags = config->info.lo_flags; - error = 0; + if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || + !file->f_op->write_iter) + lo->lo_flags |= LO_FLAGS_READ_ONLY; + + if (!lo->workqueue) { + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } + } - set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + + disk_force_media_change(lo->lo_disk); + set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - lo->use_dio = false; lo->lo_device = bdev; - lo->lo_flags = lo_flags; - lo->lo_backing_file = file; - lo->transfer = NULL; - lo->ioctl = NULL; - lo->lo_sizelimit = 0; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, config->block_size); + /* No need to freeze the queue as the device isn't bound yet. */ + error = queue_limits_commit_update(lo->lo_queue, &lim); + if (error) + goto out_unlock; + + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data in the page cache now so that the individual I/O + * operations don't have to do that.
+ */ + vfs_fsync(file, 0); loop_update_dio(lo); - set_capacity(lo->lo_disk, size); - bd_set_size(bdev, size << 9); loop_sysfs_init(lo); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - set_blocksize(bdev, S_ISBLK(inode->i_mode) ? - block_size(inode->i_bdev) : PAGE_SIZE); + size = lo_calculate_size(lo, file); + loop_set_size(lo, size); + + /* Order wrt reading lo_state in loop_validate_file(). */ + wmb(); lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; + if (partscan) + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); - mutex_unlock(&loop_ctl_mutex); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); + + loop_global_unlock(lo, is_loop); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); + + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_configure); + return 0; out_unlock: - mutex_unlock(&loop_ctl_mutex); + loop_global_unlock(lo, is_loop); +out_bdev: + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); -out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } -static int -loop_release_xfer(struct loop_device *lo) +static void __loop_clr_fd(struct loop_device *lo) { - int err = 0; - struct loop_func_table *xfer = lo->lo_encryption; - - if (xfer) { - if (xfer->release) - err = xfer->release(lo); - lo->transfer = NULL; - lo->lo_encryption = NULL; - module_put(xfer->owner); - } - return err; -} - -static int -loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, - const struct loop_info64 *i) -{ - int err = 0; - - if (xfer) { - struct module *owner = xfer->owner; - - if (!try_module_get(owner)) - return -EINVAL; - if (xfer->init) - err = xfer->init(lo, i); - if (err) - module_put(owner); - else - lo->lo_encryption = xfer; - } - return err; -} - -static int __loop_clr_fd(struct loop_device *lo, bool release) -{ - struct file *filp = NULL; + struct queue_limits lim; + struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct block_device *bdev = lo->lo_device; - int err = 0; - bool partscan = false; - int lo_number; - - mutex_lock(&loop_ctl_mutex); - if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { - err = -ENXIO; - goto out_unlock; - } - - filp = lo->lo_backing_file; - if (filp == NULL) { - err = -EINVAL; - goto out_unlock; - } - - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); + filp = lo->lo_backing_file; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); - loop_release_xfer(lo); - lo->transfer = NULL; - lo->ioctl = NULL; lo->lo_device = NULL; - lo->lo_encryption = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; - lo->lo_encrypt_key_size = 0; - memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); - memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); - blk_queue_logical_block_size(lo->lo_queue, 512); - blk_queue_physical_block_size(lo->lo_queue, 512); - blk_queue_io_min(lo->lo_queue, 512); - if (bdev) { - bdput(bdev); - invalidate_bdev(bdev); - bdev->bd_inode->i_mapping->wb_err = 0; - } - set_capacity(lo->lo_disk, 0); + + /* + * Reset the block size to the default. 
+ * + * No queue freezing needed because this is called from the final + * ->release call only, so there can't be any outstanding I/O. + */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); - if (bdev) { - bd_set_size(bdev, 0); - /* let user-space know about this change */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - } + /* let user-space know about this change */ + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); - lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); - partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; - lo_number = lo->lo_number; - lo->lo_flags = 0; - if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; - loop_unprepare_queue(lo); -out_unlock: - mutex_unlock(&loop_ctl_mutex); - if (partscan) { + disk_force_media_change(lo->lo_disk); + + if (lo->lo_flags & LO_FLAGS_PARTSCAN) { + int err; + /* - * bd_mutex has been held already in release path, so don't + * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt * must be at least one and it can only become zero when the * current holder is released. */ - if (release) - err = __blkdev_reread_part(bdev); - else - err = blkdev_reread_part(bdev); - pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", - __func__, lo_number, err); + err = bdev_disk_changed(lo->lo_disk, false); + if (err) + pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", + __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ - err = 0; } + /* - * Need not hold loop_ctl_mutex to fput backing file. - * Calling fput holding loop_ctl_mutex triggers a circular - * lock dependency possibility warning as fput can take - * bd_mutex which is usually taken before loop_ctl_mutex. + * lo->lo_state is set to Lo_unbound here after above partscan has + * finished. There cannot be anybody else entering __loop_clr_fd() as + * Lo_rundown state protects us from all the other places trying to + * change the 'lo' device. */ - if (filp) - fput(filp); - return err; + lo->lo_flags = 0; + if (!part_shift) + set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + mutex_lock(&lo->lo_mutex); + lo->lo_state = Lo_unbound; + mutex_unlock(&lo->lo_mutex); + + /* + * Need not hold lo_mutex to fput backing file. Calling fput holding + * lo_mutex triggers a circular lock dependency possibility warning as + * fput can take open_mutex which is usually taken before lo_mutex. + */ + fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&loop_ctl_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. 
+ */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&loop_ctl_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to rundown here to + * prevent new openers from coming in. */ - if (atomic_read(&lo->lo_refcnt) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&loop_ctl_mutex); - return 0; - } - lo->lo_state = Lo_rundown; - mutex_unlock(&loop_ctl_mutex); - return __loop_clr_fd(lo, false); + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; + loop_global_unlock(lo, true); + + return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - struct loop_func_table *xfer; - kuid_t uid = current_uid(); - struct block_device *bdev; bool partscan = false; + bool size_changed = false; + unsigned int memflags; - err = mutex_lock_killable(&loop_ctl_mutex); + err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; - if (lo->lo_encrypt_key_size && - !uid_eq(lo->lo_key_owner, uid) && - !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out_unlock; - } if (lo->lo_state != Lo_bound) { err = -ENXIO; goto out_unlock; } - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) { - err = -EINVAL; - goto out_unlock; - } if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { + size_changed = true; sync_blockdev(lo->lo_device); - kill_bdev(lo->lo_device); + invalidate_bdev(lo->lo_device); } - /* I/O need to be drained during transfer transition */ - blk_mq_freeze_queue(lo->lo_queue); + /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ + memflags = blk_mq_freeze_queue(lo->lo_queue); - err = loop_release_xfer(lo); + err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - if (info->lo_encrypt_type) { - unsigned int type = info->lo_encrypt_type; - - if (type >= MAX_LO_CRYPT) { - err = -EINVAL; - goto out_unfreeze; - } - xfer = xfer_funcs[type]; - if (xfer == NULL) { - err = -EINVAL; - goto out_unfreeze; - } - } else - xfer = NULL; - - err = loop_init_xfer(lo, xfer, info); - if (err) - goto out_unfreeze; - - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { - /* kill_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { - err = -EFBIG; - goto out_unfreeze; - } - } - - loop_config_discard(lo); + partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && + (info->lo_flags & LO_FLAGS_PARTSCAN); - memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); - memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); -
lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; - - if (!xfer) - xfer = &none_funcs; - lo->transfer = xfer->transfer; - lo->ioctl = xfer->ioctl; - - if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != - (info->lo_flags & LO_FLAGS_AUTOCLEAR)) - lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; - - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; - lo->lo_init[0] = info->lo_init[0]; - lo->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_key_size) { - memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, - info->lo_encrypt_key_size); - lo->lo_key_owner = uid; - } + lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); - /* update dio if lo_offset or transfer is changed */ - __loop_update_dio(lo, lo->use_dio); + /* update the direct I/O flag if lo_offset changed */ + loop_update_dio(lo); out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); - - if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_flags |= LO_FLAGS_PARTSCAN; - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - bdev = lo->lo_device; - partscan = true; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + if (partscan) + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + if (!err && size_changed) { + loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file); + loop_set_size(lo, new_size); } out_unlock: - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); return err; } @@ -1291,11 +1286,11 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) struct kstat stat; int ret; - ret = mutex_lock_killable(&loop_ctl_mutex); + ret = mutex_lock_killable(&lo->lo_mutex); if (ret) return ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); return -ENXIO; } @@ -1305,19 +1300,11 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) info->lo_sizelimit = lo->lo_sizelimit; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); - info->lo_encrypt_type = - lo->lo_encryption ? lo->lo_encryption->number : 0; - if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { - info->lo_encrypt_key_size = lo->lo_encrypt_key_size; - memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, - lo->lo_encrypt_key_size); - } - /* Drop loop_ctl_mutex while we call into the filesystem. */ + /* Drop lo_mutex while we call into the filesystem. 
*/ path = lo->lo_backing_file->f_path; path_get(&path); - mutex_unlock(&loop_ctl_mutex); + mutex_unlock(&lo->lo_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); @@ -1338,16 +1325,8 @@ loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; - info64->lo_encrypt_type = info->lo_encrypt_type; - info64->lo_encrypt_key_size = info->lo_encrypt_key_size; info64->lo_flags = info->lo_flags; - info64->lo_init[0] = info->lo_init[0]; - info64->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); - else - memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); - memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); } static int @@ -1359,16 +1338,8 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; - info->lo_encrypt_type = info64->lo_encrypt_type; - info->lo_encrypt_key_size = info64->lo_encrypt_key_size; info->lo_flags = info64->lo_flags; - info->lo_init[0] = info64->lo_init[0]; - info->lo_init[1] = info64->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); - else - memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info->lo_device != info64->lo_device || @@ -1435,60 +1406,88 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { static int loop_set_capacity(struct loop_device *lo) { + loff_t size; + if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); + size = lo_calculate_size(lo, lo->lo_backing_file); + loop_set_size(lo, size); + + return 0; } static int loop_set_dio(struct loop_device *lo, unsigned long arg) { - int error = -ENXIO; - if (lo->lo_state != Lo_bound) - goto out; + bool use_dio = !!arg; + unsigned int memflags; - __loop_update_dio(lo, !!arg); - if (lo->use_dio == !!arg) + if (lo->lo_state != Lo_bound) + return -ENXIO; + if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; - error = -EINVAL; - out: - return error; + + if (use_dio) { + if (!lo_can_use_dio(lo)) + return -EINVAL; + /* flush dirty pages before starting to use direct I/O */ + vfs_fsync(lo->lo_backing_file, 0); + } + + memflags = blk_mq_freeze_queue(lo->lo_queue); + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + return 0; } -static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, unsigned long arg) { + struct queue_limits lim; + unsigned int memflags; int err = 0; - if (lo->lo_state != Lo_bound) - return -ENXIO; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. 
-static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode,
+		struct block_device *bdev, unsigned long arg)
 {
+	struct queue_limits lim;
+	unsigned int memflags;
 	int err = 0;

-	if (lo->lo_state != Lo_bound)
-		return -ENXIO;
+	/*
+	 * If we don't hold an exclusive handle for the device, upgrade to it
+	 * here to avoid changing the device under the exclusive owner.
+	 */
+	if (!(mode & BLK_OPEN_EXCL)) {
+		err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL);
+		if (err)
+			return err;
+	}

-	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
-		return -EINVAL;
+	err = mutex_lock_killable(&lo->lo_mutex);
+	if (err)
+		goto abort_claim;

-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
+	if (lo->lo_state != Lo_bound) {
+		err = -ENXIO;
+		goto unlock;
 	}

-	blk_mq_freeze_queue(lo->lo_queue);
+	if (lo->lo_queue->limits.logical_block_size == arg)
+		goto unlock;

-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-	    lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
+	sync_blockdev(lo->lo_device);
+	invalidate_bdev(lo->lo_device);
+
+	lim = queue_limits_start_update(lo->lo_queue);
+	loop_update_limits(lo, &lim, arg);

-	blk_queue_logical_block_size(lo->lo_queue, arg);
-	blk_queue_physical_block_size(lo->lo_queue, arg);
-	blk_queue_io_min(lo->lo_queue, arg);
+	memflags = blk_mq_freeze_queue(lo->lo_queue);
+	err = queue_limits_commit_update(lo->lo_queue, &lim);
 	loop_update_dio(lo);
-out_unfreeze:
-	blk_mq_unfreeze_queue(lo->lo_queue);
+	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
+unlock:
+	mutex_unlock(&lo->lo_mutex);
+abort_claim:
+	if (!(mode & BLK_OPEN_EXCL))
+		bd_abort_claiming(bdev, loop_set_block_size);
 	return err;
 }
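Because loop_set_block_size() can now temporarily claim the block device itself (bd_prepare_to_claim()/bd_abort_claiming()) when the caller lacks BLK_OPEN_EXCL, an ordinary non-exclusive fd still suffices. A sketch under the same assumptions as above:

	/* Block size must be a supported power of two; historically 512 bytes
	 * up to the page size. */
	static int set_block_size(int loop_fd, unsigned long bsz)
	{
		return ioctl(loop_fd, LOOP_SET_BLOCK_SIZE, bsz);
	}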
@@ -1497,7 +1496,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
 {
 	int err;

-	err = mutex_lock_killable(&loop_ctl_mutex);
+	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
 	switch (cmd) {
@@ -1507,53 +1506,69 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
 	case LOOP_SET_DIRECT_IO:
 		err = loop_set_dio(lo, arg);
 		break;
-	case LOOP_SET_BLOCK_SIZE:
-		err = loop_set_block_size(lo, arg);
-		break;
 	default:
-		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+		err = -EINVAL;
 	}
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	return err;
 }

-static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
 	struct loop_device *lo = bdev->bd_disk->private_data;
+	void __user *argp = (void __user *) arg;
 	int err;

 	switch (cmd) {
-	case LOOP_SET_FD:
-		return loop_set_fd(lo, mode, bdev, arg);
+	case LOOP_SET_FD: {
+		/*
+		 * Legacy case - pass in a zeroed out struct loop_config with
+		 * only the file descriptor set, which corresponds with the
+		 * default parameters we'd have used otherwise.
+		 */
+		struct loop_config config;
+
+		memset(&config, 0, sizeof(config));
+		config.fd = arg;
+
+		return loop_configure(lo, mode, bdev, &config);
+	}
+	case LOOP_CONFIGURE: {
+		struct loop_config config;
+
+		if (copy_from_user(&config, argp, sizeof(config)))
+			return -EFAULT;
+
+		return loop_configure(lo, mode, bdev, &config);
+	}
 	case LOOP_CHANGE_FD:
 		return loop_change_fd(lo, bdev, arg);
 	case LOOP_CLR_FD:
 		return loop_clr_fd(lo);
 	case LOOP_SET_STATUS:
 		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
-			err = loop_set_status_old(lo,
-					(struct loop_info __user *)arg);
-		}
+		if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status_old(lo, argp);
 		break;
 	case LOOP_GET_STATUS:
-		return loop_get_status_old(lo, (struct loop_info __user *) arg);
+		return loop_get_status_old(lo, argp);
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
-			err = loop_set_status64(lo,
-					(struct loop_info64 __user *) arg);
-		}
+		if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status64(lo, argp);
 		break;
 	case LOOP_GET_STATUS64:
-		return loop_get_status64(lo, (struct loop_info64 __user *) arg);
+		return loop_get_status64(lo, argp);
+	case LOOP_SET_BLOCK_SIZE:
+		if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		return loop_set_block_size(lo, mode, bdev, arg);
 	case LOOP_SET_CAPACITY:
 	case LOOP_SET_DIRECT_IO:
-	case LOOP_SET_BLOCK_SIZE:
-		if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
+		if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
			return -EPERM;
-		/* Fall through */
+		fallthrough;
 	default:
 		err = lo_simple_ioctl(lo, cmd, arg);
 		break;
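LOOP_CONFIGURE, added to the dispatch above, binds and configures a device in one step instead of the historical LOOP_SET_FD + LOOP_SET_STATUS64 pair. A userspace sketch (hypothetical fds, error handling trimmed, same headers as the earlier sketches):

	/* Bind backing_fd to the loop device and set parameters atomically. */
	static int configure_loop(int loop_fd, int backing_fd)
	{
		struct loop_config cfg = { 0 };

		cfg.fd = backing_fd;
		cfg.block_size = 4096;	/* 0 keeps the driver default */
		cfg.info.lo_flags = LO_FLAGS_DIRECT_IO | LO_FLAGS_AUTOCLEAR;
		return ioctl(loop_fd, LOOP_CONFIGURE, &cfg);
	}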
@@ -1569,7 +1584,7 @@ struct compat_loop_info {
 	compat_ulong_t	lo_inode;       /* ioctl r/o */
 	compat_dev_t	lo_rdevice;     /* ioctl r/o */
 	compat_int_t	lo_offset;
-	compat_int_t	lo_encrypt_type;
+	compat_int_t	lo_encrypt_type;        /* obsolete, ignored */
 	compat_int_t	lo_encrypt_key_size;    /* ioctl w/o */
 	compat_int_t	lo_flags;       /* ioctl r/o */
 	char		lo_name[LO_NAME_SIZE];
@@ -1598,16 +1613,8 @@ loop_info64_from_compat(const struct compat_loop_info __user *arg,
 	info64->lo_rdevice = info.lo_rdevice;
 	info64->lo_offset = info.lo_offset;
 	info64->lo_sizelimit = 0;
-	info64->lo_encrypt_type = info.lo_encrypt_type;
-	info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
 	info64->lo_flags = info.lo_flags;
-	info64->lo_init[0] = info.lo_init[0];
-	info64->lo_init[1] = info.lo_init[1];
-	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-		memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
-	else
-		memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
-	memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
+	memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
 	return 0;
 }

@@ -1627,24 +1634,14 @@ loop_info64_to_compat(const struct loop_info64 *info64,
 	info.lo_inode = info64->lo_inode;
 	info.lo_rdevice = info64->lo_rdevice;
 	info.lo_offset = info64->lo_offset;
-	info.lo_encrypt_type = info64->lo_encrypt_type;
-	info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
 	info.lo_flags = info64->lo_flags;
-	info.lo_init[0] = info64->lo_init[0];
-	info.lo_init[1] = info64->lo_init[1];
-	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
-		memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
-	else
-		memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
-	memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+	memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);

 	/* error in case values were truncated */
 	if (info.lo_device != info64->lo_device ||
 	    info.lo_rdevice != info64->lo_rdevice ||
 	    info.lo_inode != info64->lo_inode ||
-	    info.lo_offset != info64->lo_offset ||
-	    info.lo_init[0] != info64->lo_init[0] ||
-	    info.lo_init[1] != info64->lo_init[1])
+	    info.lo_offset != info64->lo_offset)
 		return -EOVERFLOW;

 	if (copy_to_user(arg, &info, sizeof(info)))
@@ -1680,7 +1677,7 @@ loop_get_status_compat(struct loop_device *lo,
 	return err;
 }

-static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
+static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
 			   unsigned int cmd, unsigned long arg)
 {
 	struct loop_device *lo = bdev->bd_disk->private_data;
@@ -1699,11 +1696,13 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 	case LOOP_CLR_FD:
 	case LOOP_GET_STATUS64:
 	case LOOP_SET_STATUS64:
+	case LOOP_CONFIGURE:
 		arg = (unsigned long) compat_ptr(arg);
-		/* fall through */
+		fallthrough;
 	case LOOP_SET_FD:
 	case LOOP_CHANGE_FD:
 	case LOOP_SET_BLOCK_SIZE:
+	case LOOP_SET_DIRECT_IO:
 		err = lo_ioctl(bdev, mode, cmd, arg);
 		break;
 	default:
@@ -1714,117 +1713,141 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 }
 #endif

-static int lo_open(struct block_device *bdev, fmode_t mode)
+static int lo_open(struct gendisk *disk, blk_mode_t mode)
 {
-	struct loop_device *lo;
+	struct loop_device *lo = disk->private_data;
 	int err;

-	err = mutex_lock_killable(&loop_ctl_mutex);
+	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
-	lo = bdev->bd_disk->private_data;
-	if (!lo) {
-		err = -ENXIO;
-		goto out;
-	}
-
-	atomic_inc(&lo->lo_refcnt);
-out:
-	mutex_unlock(&loop_ctl_mutex);
+	if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown)
+		err = -ENXIO;
+	mutex_unlock(&lo->lo_mutex);
 	return err;
 }

-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void lo_release(struct gendisk *disk)
 {
-	struct loop_device *lo;
+	struct loop_device *lo = disk->private_data;
+	bool need_clear = false;

-	mutex_lock(&loop_ctl_mutex);
-	lo = disk->private_data;
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	if (disk_openers(disk) > 0)
+		return;
+	/*
+	 * Clear the backing device information if this is the last close of
+	 * a device that's been marked for auto clear, or on which LOOP_CLR_FD
+	 * has been called.
+	 */

-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	mutex_lock(&lo->lo_mutex);
+	if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR))
 		lo->lo_state = Lo_rundown;
-		mutex_unlock(&loop_ctl_mutex);
-		/*
-		 * In autoclear mode, stop the loop thread
-		 * and remove configuration after last close.
-		 */
-		__loop_clr_fd(lo, true);
-		return;
-	} else if (lo->lo_state == Lo_bound) {
-		/*
-		 * Otherwise keep thread (if running) and config,
-		 * but flush possible ongoing bios in thread.
-		 */
-		blk_mq_freeze_queue(lo->lo_queue);
-		blk_mq_unfreeze_queue(lo->lo_queue);
-	}
-out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	need_clear = (lo->lo_state == Lo_rundown);
+	mutex_unlock(&lo->lo_mutex);
+
+	if (need_clear)
+		__loop_clr_fd(lo);
+}
+
+static void lo_free_disk(struct gendisk *disk)
+{
+	struct loop_device *lo = disk->private_data;
+
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	timer_shutdown_sync(&lo->timer);
+	mutex_destroy(&lo->lo_mutex);
+	kfree(lo);
 }

 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
-	.open =		lo_open,
+	.open =		lo_open,
 	.release =	lo_release,
 	.ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	lo_compat_ioctl,
 #endif
+	.free_disk =	lo_free_disk,
 };

 /*
  * And now the modules code and kernel interface.
  */
-static int max_loop;
-module_param(max_loop, int, 0444);
-MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
-module_param(max_part, int, 0444);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

-int loop_register_transfer(struct loop_func_table *funcs)
-{
-	unsigned int n = funcs->number;
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * the default isn't a hard limit (as before commit 85c50197716c
+ * changed the default value from 0 for max_loop=0 reasons), just
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;

-	if (n >= MAX_LO_CRYPT || xfer_funcs[n])
-		return -EINVAL;
-	xfer_funcs[n] = funcs;
-	return 0;
-}
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+static bool max_loop_specified;

-static int unregister_transfer_cb(int id, void *ptr, void *data)
+static int max_loop_param_set_int(const char *val,
+				  const struct kernel_param *kp)
 {
-	struct loop_device *lo = ptr;
-	struct loop_func_table *xfer = data;
+	int ret;

-	mutex_lock(&loop_ctl_mutex);
-	if (lo->lo_encryption == xfer)
-		loop_release_xfer(lo);
-	mutex_unlock(&loop_ctl_mutex);
+	ret = param_set_int(val, kp);
+	if (ret < 0)
+		return ret;
+
+	max_loop_specified = true;
 	return 0;
 }

-int loop_unregister_transfer(int number)
+static const struct kernel_param_ops max_loop_param_ops = {
+	.set = max_loop_param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444);
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
+#else
+module_param(max_loop, int, 0444);
+MODULE_PARM_DESC(max_loop, "Initial number of loop devices");
+#endif
+
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
+
+static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH;
+
+static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p)
 {
-	unsigned int n = number;
-	struct loop_func_table *xfer;
+	int qd, ret;

-	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
+	ret = kstrtoint(s, 0, &qd);
+	if (ret < 0)
+		return ret;
+	if (qd < 1)
 		return -EINVAL;
-
-	xfer_funcs[n] = NULL;
-	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
+	hw_queue_depth = qd;
 	return 0;
 }
-EXPORT_SYMBOL(loop_register_transfer);
-EXPORT_SYMBOL(loop_unregister_transfer);
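With the transfer-function registration API removed along with cryptoloop, the module exports nothing and its configuration surface is reduced to plain parameters. Both max_loop and the hw_queue_depth parameter defined next are 0444: set at load time (or on the kernel command line), read-only afterwards. A hypothetical invocation might look like:

	modprobe loop max_loop=8 hw_queue_depth=256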
+static const struct kernel_param_ops loop_hw_qdepth_param_ops = {
+	.set	= loop_set_hw_queue_depth,
+	.get	= param_get_int,
+};
+
+device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH));
+
+MODULE_DESCRIPTION("Loopback device support");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

 static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
@@ -1845,69 +1868,143 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		cmd->use_aio = false;
 		break;
 	default:
-		cmd->use_aio = lo->use_dio;
+		cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 		break;
 	}

 	/* always use the first bio's css */
+	cmd->blkcg_css = NULL;
+	cmd->memcg_css = NULL;
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
-		cmd->css = &bio_blkcg(rq->bio)->css;
-		css_get(cmd->css);
-	} else
+	if (rq->bio) {
+		cmd->blkcg_css = bio_blkcg_css(rq->bio);
+#ifdef CONFIG_MEMCG
+		if (cmd->blkcg_css) {
+			cmd->memcg_css =
+				cgroup_get_e_css(cmd->blkcg_css->cgroup,
+						 &memory_cgrp_subsys);
+		}
 #endif
-		cmd->css = NULL;
-	kthread_queue_work(&lo->worker, &cmd->work);
+	}
+#endif
+	loop_queue_work(lo, cmd);

 	return BLK_STS_OK;
 }
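loop_queue_rq() above records the submitting bio's blkcg css and, under CONFIG_MEMCG, takes a reference on the matching memory cgroup via cgroup_get_e_css(). The handler below consumes that pair so both block throttling and memory are charged to the originating cgroup; reduced to its skeleton (simplified from loop_handle_cmd(), not a drop-in snippet):

	kthread_associate_blkcg(cmd_blkcg_css);
	old_memcg = set_active_memcg(mem_cgroup_from_css(cmd_memcg_css));

	ret = do_req_filebacked(lo, rq);	/* I/O charged to the cgroup */

	kthread_associate_blkcg(NULL);
	set_active_memcg(old_memcg);
	css_put(cmd_memcg_css);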

 static void loop_handle_cmd(struct loop_cmd *cmd)
 {
+	struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css;
+	struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css;
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	const bool write = op_is_write(req_op(rq));
 	struct loop_device *lo = rq->q->queuedata;
 	int ret = 0;
+	struct mem_cgroup *old_memcg = NULL;

 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
 		ret = -EIO;
 		goto failed;
 	}

+	/* We can block in this context, so ignore REQ_NOWAIT. */
+	if (rq->cmd_flags & REQ_NOWAIT)
+		rq->cmd_flags &= ~REQ_NOWAIT;
+
+	if (cmd_blkcg_css)
+		kthread_associate_blkcg(cmd_blkcg_css);
+	if (cmd_memcg_css)
+		old_memcg = set_active_memcg(
+			mem_cgroup_from_css(cmd_memcg_css));
+
+	/*
+	 * do_req_filebacked() may call blk_mq_complete_request() synchronously
+	 * or asynchronously if using aio. Hence, do not touch 'cmd' after
+	 * do_req_filebacked() has returned unless we are sure that 'cmd' has
+	 * not yet been completed.
+	 */
 	ret = do_req_filebacked(lo, rq);
+
+	if (cmd_blkcg_css)
+		kthread_associate_blkcg(NULL);
+
+	if (cmd_memcg_css) {
+		set_active_memcg(old_memcg);
+		css_put(cmd_memcg_css);
+	}
 failed:
 	/* complete non-aio request */
-	if (!cmd->use_aio || ret) {
-		cmd->ret = ret ? -EIO : 0;
-		blk_mq_complete_request(rq);
+	if (ret != -EIOCBQUEUED) {
+		if (ret == -EOPNOTSUPP)
+			cmd->ret = ret;
+		else
+			cmd->ret = ret ? -EIO : 0;
+		if (likely(!blk_should_fake_timeout(rq->q)))
+			blk_mq_complete_request(rq);
 	}
 }

-static void loop_queue_work(struct kthread_work *work)
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
 {
-	struct loop_cmd *cmd =
-		container_of(work, struct loop_cmd, work);
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;

-	loop_handle_cmd(cmd);
+	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+	spin_lock_irq(&lo->lo_work_lock);
+	while (!list_empty(cmd_list)) {
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_work_lock);
+
+		loop_handle_cmd(cmd);
+		cond_resched();
+
+		spin_lock_irq(&lo->lo_work_lock);
+	}
+
+	/*
+	 * We only add to the idle list if there are no pending cmds
+	 * *and* the worker will not run again, which ensures that it
+	 * is safe to free any worker on the idle list.
+	 */
+	if (worker && !work_pending(&worker->work)) {
+		worker->last_ran_at = jiffies;
+		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
+		loop_set_timer(lo);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	current->flags = orig_flags;
 }

-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
 {
-	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}

-	kthread_init_work(&cmd->work, loop_queue_work);
-	return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }

 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
-	.init_request	= loop_init_request,
 	.complete	= lo_complete_rq,
 };
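loop_process_work() raises PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO on current for the whole batch so that writeback to the backing file is neither throttled against itself nor allowed to recurse into I/O-issuing reclaim. The NOIO half has scoped helpers in <linux/sched/mm.h>; a sketch of the equivalent pattern for code that only needs that part:

	unsigned int noio_flags = memalloc_noio_save();

	/* allocations here implicitly get GFP_NOIO semantics */

	memalloc_noio_restore(noio_flags);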

-static int loop_add(struct loop_device **l, int i)
+static int loop_add(int i)
 {
+	struct queue_limits lim = {
+		/*
+		 * Random number picked from the historic block max_sectors cap.
+		 */
+		.max_hw_sectors		= 2560u,
+	};
 	struct loop_device *lo;
 	struct gendisk *disk;
 	int err;
@@ -1916,9 +2013,15 @@ static int loop_add(struct loop_device **l, int i)
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
 	if (!lo)
 		goto out;
-
+	lo->worker_tree = RB_ROOT;
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
 	lo->lo_state = Lo_unbound;

+	err = mutex_lock_killable(&loop_ctl_mutex);
+	if (err)
+		goto out_free_dev;
+
 	/* allocate id, if @id >= 0, we're requesting that specific id */
 	if (i >= 0) {
 		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
@@ -1927,44 +2030,29 @@ static int loop_add(struct loop_device **l, int i)
 	} else {
 		err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
 	}
+	mutex_unlock(&loop_ctl_mutex);
 	if (err < 0)
 		goto out_free_dev;
 	i = err;

-	err = -ENOMEM;
 	lo->tag_set.ops = &loop_mq_ops;
 	lo->tag_set.nr_hw_queues = 1;
-	lo->tag_set.queue_depth = 128;
+	lo->tag_set.queue_depth = hw_queue_depth;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
 	lo->tag_set.driver_data = lo;

 	err = blk_mq_alloc_tag_set(&lo->tag_set);
 	if (err)
 		goto out_free_idr;

-	lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
-	if (IS_ERR(lo->lo_queue)) {
-		err = PTR_ERR(lo->lo_queue);
+	disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
 		goto out_cleanup_tags;
 	}
-	lo->lo_queue->queuedata = lo;
-
-	blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
-
-	/*
-	 * By default, we do buffer IO, so it doesn't make sense to enable
-	 * merge because the I/O submitted to backing file is handled page by
-	 * page. For directio mode, merge does help to dispatch bigger request
-	 * to underlayer disk. We will enable merge once directio is enabled.
-	 */
-	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
-
-	err = -ENOMEM;
-	disk = lo->lo_disk = alloc_disk(1 << part_shift);
-	if (!disk)
-		goto out_free_queue;
+	lo->lo_queue = lo->lo_disk->queue;

 	/*
 	 * Disable partition scanning by default. The in-kernel partition
@@ -1985,27 +2073,42 @@ static int loop_add(struct loop_device **l, int i)
 	 * userspace tools. Parameters like this in general should be avoided.
 	 */
 	if (!part_shift)
-		disk->flags |= GENHD_FL_NO_PART_SCAN;
-	disk->flags |= GENHD_FL_EXT_DEVT;
-	atomic_set(&lo->lo_refcnt, 0);
+		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
+	spin_lock_init(&lo->lo_work_lock);
+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
+	disk->minors		= 1 << part_shift;
 	disk->fops		= &lo_fops;
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
+	disk->events		= DISK_EVENT_MEDIA_CHANGE;
+	disk->event_flags	= DISK_EVENT_FLAG_UEVENT;
 	sprintf(disk->disk_name, "loop%d", i);
-	add_disk(disk);
-	*l = lo;
-	return lo->lo_number;
+	/* Make this loop device reachable from pathname. */
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;

-out_free_queue:
-	blk_cleanup_queue(lo->lo_queue);
+	/* Show this loop device. */
+	mutex_lock(&loop_ctl_mutex);
+	lo->idr_visible = true;
+	mutex_unlock(&loop_ctl_mutex);
+
+	return i;
+
+out_cleanup_disk:
+	put_disk(disk);
 out_cleanup_tags:
 	blk_mq_free_tag_set(&lo->tag_set);
 out_free_idr:
+	mutex_lock(&loop_ctl_mutex);
 	idr_remove(&loop_index_idr, i);
+	mutex_unlock(&loop_ctl_mutex);
 out_free_dev:
 	kfree(lo);
 out:
@@ -2014,116 +2117,110 @@ out:

 static void loop_remove(struct loop_device *lo)
 {
+	/* Make this loop device unreachable from pathname. */
 	del_gendisk(lo->lo_disk);
-	blk_cleanup_queue(lo->lo_queue);
 	blk_mq_free_tag_set(&lo->tag_set);
+
+	mutex_lock(&loop_ctl_mutex);
+	idr_remove(&loop_index_idr, lo->lo_number);
+	mutex_unlock(&loop_ctl_mutex);
+
 	put_disk(lo->lo_disk);
-	kfree(lo);
 }

-static int find_free_cb(int id, void *ptr, void *data)
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+static void loop_probe(dev_t dev)
 {
-	struct loop_device *lo = ptr;
-	struct loop_device **l = data;
+	int idx = MINOR(dev) >> part_shift;

-	if (lo->lo_state == Lo_unbound) {
-		*l = lo;
-		return 1;
-	}
-	return 0;
+	if (max_loop_specified && max_loop && idx >= max_loop)
+		return;
+	loop_add(idx);
 }
+#else
+#define loop_probe NULL
+#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */

-static int loop_lookup(struct loop_device **l, int i)
+static int loop_control_remove(int idx)
 {
 	struct loop_device *lo;
-	int ret = -ENODEV;
-
-	if (i < 0) {
-		int err;
+	int ret;

-		err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
-		if (err == 1) {
-			*l = lo;
-			ret = lo->lo_number;
-		}
-		goto out;
+	if (idx < 0) {
+		pr_warn_once("deleting an unspecified loop device is not supported.\n");
+		return -EINVAL;
 	}
+
+	/* Hide this loop device for serialization. */
+	ret = mutex_lock_killable(&loop_ctl_mutex);
+	if (ret)
+		return ret;
+	lo = idr_find(&loop_index_idr, idx);
+	if (!lo || !lo->idr_visible)
+		ret = -ENODEV;
+	else
+		lo->idr_visible = false;
+	mutex_unlock(&loop_ctl_mutex);
+	if (ret)
+		return ret;

-	/* lookup and return a specific i */
-	lo = idr_find(&loop_index_idr, i);
-	if (lo) {
-		*l = lo;
-		ret = lo->lo_number;
+	/* Check whether this loop device can be removed. */
+	ret = mutex_lock_killable(&lo->lo_mutex);
+	if (ret)
+		goto mark_visible;
+	if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) {
+		mutex_unlock(&lo->lo_mutex);
+		ret = -EBUSY;
+		goto mark_visible;
 	}
-out:
-	return ret;
-}
+	/* Mark this loop device as no more bound, but not quite unbound yet */
+	lo->lo_state = Lo_deleting;
+	mutex_unlock(&lo->lo_mutex);

-static struct kobject *loop_probe(dev_t dev, int *part, void *data)
-{
-	struct loop_device *lo;
-	struct kobject *kobj;
-	int err;
+	loop_remove(lo);
+	return 0;

+mark_visible:
+	/* Show this loop device again. */
 	mutex_lock(&loop_ctl_mutex);
-	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
-	if (err < 0)
-		err = loop_add(&lo, MINOR(dev) >> part_shift);
-	if (err < 0)
-		kobj = NULL;
-	else
-		kobj = get_disk_and_module(lo->lo_disk);
+	lo->idr_visible = true;
 	mutex_unlock(&loop_ctl_mutex);
-
-	*part = 0;
-	return kobj;
+	return ret;
 }
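loop_control_remove() keeps the user-visible LOOP_CTL_REMOVE contract (-EBUSY while the device is bound or held open) while idr_visible serializes against concurrent lookups. A sketch of the caller side, assuming the usual <fcntl.h>/<sys/ioctl.h>/<unistd.h>/<linux/loop.h> headers:

	static int remove_loop(int idx)
	{
		int ctl_fd = open("/dev/loop-control", O_RDWR);
		int ret = ctl_fd < 0 ? -1 : ioctl(ctl_fd, LOOP_CTL_REMOVE, idx);

		if (ctl_fd >= 0)
			close(ctl_fd);
		return ret;	/* fails while the device is still in use */
	}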

-static long loop_control_ioctl(struct file *file, unsigned int cmd,
-			       unsigned long parm)
+static int loop_control_get_free(int idx)
 {
 	struct loop_device *lo;
-	int ret;
+	int id, ret;

 	ret = mutex_lock_killable(&loop_ctl_mutex);
 	if (ret)
 		return ret;
+	idr_for_each_entry(&loop_index_idr, lo, id) {
+		/* Hitting a race results in creating a new loop device, which is harmless. */
+		if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound)
+			goto found;
+	}
+	mutex_unlock(&loop_ctl_mutex);
+	return loop_add(-1);
+found:
+	mutex_unlock(&loop_ctl_mutex);
+	return id;
+}

-	ret = -ENOSYS;
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long parm)
+{
 	switch (cmd) {
 	case LOOP_CTL_ADD:
-		ret = loop_lookup(&lo, parm);
-		if (ret >= 0) {
-			ret = -EEXIST;
-			break;
-		}
-		ret = loop_add(&lo, parm);
-		break;
+		return loop_add(parm);
 	case LOOP_CTL_REMOVE:
-		ret = loop_lookup(&lo, parm);
-		if (ret < 0)
-			break;
-		if (lo->lo_state != Lo_unbound) {
-			ret = -EBUSY;
-			break;
-		}
-		if (atomic_read(&lo->lo_refcnt) > 0) {
-			ret = -EBUSY;
-			break;
-		}
-		lo->lo_disk->private_data = NULL;
-		idr_remove(&loop_index_idr, lo->lo_number);
-		loop_remove(lo);
-		break;
+		return loop_control_remove(parm);
 	case LOOP_CTL_GET_FREE:
-		ret = loop_lookup(&lo, -1);
-		if (ret >= 0)
-			break;
-		ret = loop_add(&lo, -1);
+		return loop_control_get_free(parm);
+	default:
+		return -ENOSYS;
 	}
-	mutex_unlock(&loop_ctl_mutex);
-
-	return ret;
 }
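loop_control_get_free() is the kernel half of losetup -f: reuse a visible unbound device or create a fresh one. The classic userspace pairing, sketched with hypothetical names:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/loop.h>

	static int open_free_loop(char *path, size_t len)
	{
		int ctl_fd = open("/dev/loop-control", O_RDWR);
		int idx;

		if (ctl_fd < 0)
			return -1;
		idx = ioctl(ctl_fd, LOOP_CTL_GET_FREE, 0);
		close(ctl_fd);
		if (idx < 0)
			return -1;
		snprintf(path, len, "/dev/loop%d", idx);
		return open(path, O_RDWR);
	}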

 static const struct file_operations loop_ctl_fops = {
@@ -2145,9 +2242,7 @@ MODULE_ALIAS("devname:loop-control");

 static int __init loop_init(void)
 {
-	int i, nr;
-	unsigned long range;
-	struct loop_device *lo;
+	int i;
 	int err;

 	part_shift = 0;
@@ -2175,40 +2270,19 @@ static int __init loop_init(void)
 		goto err_out;
 	}

-	/*
-	 * If max_loop is specified, create that many devices upfront.
-	 * This also becomes a hard limit. If max_loop is not specified,
-	 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
-	 * init time. Loop devices can be requested on-demand with the
-	 * /dev/loop-control interface, or be instantiated by accessing
-	 * a 'dead' device node.
-	 */
-	if (max_loop) {
-		nr = max_loop;
-		range = max_loop << part_shift;
-	} else {
-		nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-		range = 1UL << MINORBITS;
-	}
-
 	err = misc_register(&loop_misc);
 	if (err < 0)
 		goto err_out;

-	if (register_blkdev(LOOP_MAJOR, "loop")) {
+	if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) {
 		err = -EIO;
 		goto misc_out;
 	}

-	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
-				  THIS_MODULE, loop_probe, NULL, NULL);
-
 	/* pre-create number of devices given by config or max_loop */
-	mutex_lock(&loop_ctl_mutex);
-	for (i = 0; i < nr; i++)
-		loop_add(&lo, i);
-	mutex_unlock(&loop_ctl_mutex);
+	for (i = 0; i < max_loop; i++)
+		loop_add(i);

 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;
@@ -2219,27 +2293,24 @@ err_out:
 	return err;
 }

-static int loop_exit_cb(int id, void *ptr, void *data)
-{
-	struct loop_device *lo = ptr;
-
-	loop_remove(lo);
-	return 0;
-}
-
 static void __exit loop_exit(void)
 {
-	unsigned long range;
-
-	range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
-
-	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
-	idr_destroy(&loop_index_idr);
+	struct loop_device *lo;
+	int id;

-	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
-	misc_deregister(&loop_misc);
+
+	/*
+	 * There is no need to use loop_ctl_mutex here, for nobody else can
+	 * access loop_index_idr when this module is unloading (unless forced
+	 * module unloading is requested). If this is not a clean unloading,
+	 * we have no means to avoid kernel crash.
+	 */
+	idr_for_each_entry(&loop_index_idr, lo, id)
+		loop_remove(lo);
+
+	idr_destroy(&loop_index_idr);
 }

 module_init(loop_init);
@@ -2249,6 +2320,9 @@ module_exit(loop_exit);
 static int __init max_loop_setup(char *str)
 {
 	max_loop = simple_strtol(str, NULL, 0);
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+	max_loop_specified = true;
+#endif
 	return 1;
 }
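max_loop_setup() preserves the legacy boot-time syntax, so a built-in driver still honors e.g. max_loop=8 on the kernel command line (loop.max_loop=8 in module-parameter form); under CONFIG_BLOCK_LEGACY_AUTOLOAD it also records that an explicit limit was given, which loop_probe() uses to refuse on-demand device creation past that limit.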
