diff options
Diffstat (limited to 'drivers/block/loop.c')
| -rw-r--r-- | drivers/block/loop.c | 1786 |
1 files changed, 785 insertions, 1001 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f37b9e3d833c..272bc608e528 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll <rkroll@exploits.org> 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe <axboe@suse.de>, Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. */ - #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> @@ -59,7 +12,6 @@ #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> -#include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> @@ -79,85 +31,141 @@ #include <linux/ioprio.h> #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> +#include <linux/statfs.h> +#include <linux/uaccess.h> +#include <linux/blk-mq.h> +#include <linux/spinlock.h> +#include <uapi/linux/loop.h> + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; -#include "loop.h" +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file *lo_backing_file; + unsigned int lo_min_dio_size; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; -#include <linux/uaccess.h> +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); +static DEFINE_MUTEX(loop_validate_mutex); -static int max_part; -static int part_shift; - -static int transfer_xor(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block) +/** + * loop_global_lock_killable() - take locks for safe loop_validate_file() test + * + * @lo: struct loop_device + * @global: true if @lo is about to bind another "struct loop_device", false otherwise + * + * Returns 0 on success, -EINTR otherwise. + * + * Since loop_validate_file() traverses on other "struct loop_device" if + * is_loop_device() is true, we need a global lock for serializing concurrent + * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. + */ +static int loop_global_lock_killable(struct loop_device *lo, bool global) { - char *raw_buf = kmap_atomic(raw_page) + raw_off; - char *loop_buf = kmap_atomic(loop_page) + loop_off; - char *in, *out, *key; - int i, keysize; + int err; - if (cmd == READ) { - in = raw_buf; - out = loop_buf; - } else { - in = loop_buf; - out = raw_buf; + if (global) { + err = mutex_lock_killable(&loop_validate_mutex); + if (err) + return err; } - - key = lo->lo_encrypt_key; - keysize = lo->lo_encrypt_key_size; - for (i = 0; i < size; i++) - *out++ = *in++ ^ key[(i & 511) % keysize]; - - kunmap_atomic(loop_buf); - kunmap_atomic(raw_buf); - cond_resched(); - return 0; + err = mutex_lock_killable(&lo->lo_mutex); + if (err && global) + mutex_unlock(&loop_validate_mutex); + return err; } -static int xor_init(struct loop_device *lo, const struct loop_info64 *info) +/** + * loop_global_unlock() - release locks taken by loop_global_lock_killable() + * + * @lo: struct loop_device + * @global: true if @lo was about to bind another "struct loop_device", false otherwise + */ +static void loop_global_unlock(struct loop_device *lo, bool global) { - if (unlikely(info->lo_encrypt_key_size <= 0)) - return -EINVAL; - return 0; + mutex_unlock(&lo->lo_mutex); + if (global) + mutex_unlock(&loop_validate_mutex); } -static struct loop_func_table none_funcs = { - .number = LO_CRYPT_NONE, -}; - -static struct loop_func_table xor_funcs = { - .number = LO_CRYPT_XOR, - .transfer = transfer_xor, - .init = xor_init -}; - -/* xfer_funcs[0] is special - its release function is never called */ -static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { - &none_funcs, - &xor_funcs -}; +static int max_part; +static int part_shift; -static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) +static loff_t lo_calculate_size(struct loop_device *lo, struct file *file) { loff_t loopsize; + int ret; + + if (S_ISBLK(file_inode(file)->i_mode)) { + loopsize = i_size_read(file->f_mapping->host); + } else { + struct kstat stat; + + /* + * Get the accurate file size. This provides better results than + * cached inode data, particularly for network filesystems where + * metadata may be stale. + */ + ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0); + if (ret) + return 0; + + loopsize = stat.size; + } - /* Compute loopsize in bytes */ - loopsize = i_size_read(file->f_mapping->host); - if (offset > 0) - loopsize -= offset; + if (lo->lo_offset > 0) + loopsize -= lo->lo_offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; - - if (sizelimit > 0 && sizelimit < loopsize) - loopsize = sizelimit; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. @@ -165,83 +173,38 @@ static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) return loopsize >> 9; } -static loff_t get_loop_size(struct loop_device *lo, struct file *file) +/* + * We support direct I/O only if lo_offset is aligned with the logical I/O size + * of backing device, and the logical block size of loop is bigger than that of + * the backing device. + */ +static bool lo_can_use_dio(struct loop_device *lo) { - return get_size(lo->lo_offset, lo->lo_sizelimit, file); + if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) + return false; + if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) + return false; + if (lo->lo_offset & (lo->lo_min_dio_size - 1)) + return false; + return true; } -static void __loop_update_dio(struct loop_device *lo, bool dio) -{ - struct file *file = lo->lo_backing_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned short sb_bsize = 0; - unsigned dio_align = 0; - bool use_dio; - - if (inode->i_sb->s_bdev) { - sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - dio_align = sb_bsize - 1; - } - - /* - * We support direct I/O only if lo_offset is aligned with the - * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's and the loop - * needn't transform transfer. - * - * TODO: the above condition may be loosed in the future, and - * direct I/O may be switched runtime at that time because most - * of requests in sane applications should be PAGE_SIZE aligned - */ - if (dio) { - if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO && - !lo->transfer) - use_dio = true; - else - use_dio = false; - } else { - use_dio = false; - } - - if (lo->use_dio == use_dio) - return; - - /* flush dirty pages before changing direct IO */ - vfs_fsync(file, 0); - - /* - * The flag of LO_FLAGS_DIRECT_IO is handled similarly with - * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup - * will get updated by ioctl(LOOP_GET_STATUS) - */ - if (lo->lo_state == Lo_bound) - blk_mq_freeze_queue(lo->lo_queue); - lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); - lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } - if (lo->lo_state == Lo_bound) - blk_mq_unfreeze_queue(lo->lo_queue); -} - -/** - * loop_validate_block_size() - validates the passed in block size - * @bsize: size to validate +/* + * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by + * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently + * disabled when the device block size is too small or the offset is unaligned. + * + * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and + * not the originally passed in one. */ -static int -loop_validate_block_size(unsigned short bsize) +static inline void loop_update_dio(struct loop_device *lo) { - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) - return -EINVAL; + lockdep_assert_held(&lo->lo_mutex); + WARN_ON_ONCE(lo->lo_state == Lo_bound && + lo->lo_queue->mq_freeze_depth == 0); - return 0; + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; } /** @@ -258,175 +221,26 @@ static void loop_set_size(struct loop_device *lo, loff_t size) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } -static inline int -lo_do_transfer(struct loop_device *lo, int cmd, - struct page *rpage, unsigned roffs, - struct page *lpage, unsigned loffs, - int size, sector_t rblock) +static void loop_clear_limits(struct loop_device *lo, int mode) { - int ret; - - ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); - if (likely(!ret)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Transfer error at byte offset %llu, length %i.\n", - (unsigned long long)rblock << 9, size); - return ret; -} - -static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) -{ - struct iov_iter i; - ssize_t bw; + struct queue_limits lim = queue_limits_start_update(lo->lo_queue); - iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); + if (mode & FALLOC_FL_ZERO_RANGE) + lim.max_write_zeroes_sectors = 0; - file_start_write(file); - bw = vfs_iter_write(file, &i, ppos, 0); - file_end_write(file); - - if (likely(bw == bvec->bv_len)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Write error at byte offset %llu, length %i.\n", - (unsigned long long)*ppos, bvec->bv_len); - if (bw >= 0) - bw = -EIO; - return bw; -} - -static int lo_write_simple(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec; - struct req_iterator iter; - int ret = 0; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); - if (ret < 0) - break; - cond_resched(); + if (mode & FALLOC_FL_PUNCH_HOLE) { + lim.max_hw_discard_sectors = 0; + lim.discard_granularity = 0; } - return ret; -} - -/* - * This is the slow, transforming version that needs to double buffer the - * data as it cannot do the transformations in place without having direct - * access to the destination pages of the backing file. - */ -static int lo_write_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec, b; - struct req_iterator iter; - struct page *page; - int ret = 0; - - page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, - bvec.bv_offset, bvec.bv_len, pos >> 9); - if (unlikely(ret)) - break; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); - if (ret < 0) - break; - } - - __free_page(page); - return ret; -} - -static int lo_read_simple(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec; - struct req_iterator iter; - struct iov_iter i; - ssize_t len; - - rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) - return len; - - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - cond_resched(); - } - - return 0; -} - -static int lo_read_transfer(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec, b; - struct req_iterator iter; - struct iov_iter i; - struct page *page; - ssize_t len; - int ret = 0; - - page = alloc_page(GFP_NOIO); - if (unlikely(!page)) - return -ENOMEM; - - rq_for_each_segment(bvec, rq, iter) { - loff_t offset = pos; - - b.bv_page = page; - b.bv_offset = 0; - b.bv_len = bvec.bv_len; - - iov_iter_bvec(&i, READ, &b, 1, b.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) { - ret = len; - goto out_free_page; - } - - ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, - bvec.bv_offset, len, offset >> 9); - if (ret) - goto out_free_page; - - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - } - - ret = 0; -out_free_page: - __free_page(page); - return ret; + /* + * XXX: this updates the queue limits without freezing the queue, which + * is against the locking protocol and dangerous. But we can't just + * freeze the queue as we're inside the ->queue_rq method here. So this + * should move out into a workqueue unless we get the file operations to + * advertise if they support specific fallocate operations. + */ + queue_limits_commit_update(lo->lo_queue, &lim); } static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, @@ -434,32 +248,33 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, { /* * We use fallocate to manipulate the space mappings used by the image - * a.k.a. discard/zerorange. However we do not support this if - * encryption is enabled, because it may give an attacker useful - * information. + * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(q)) { - ret = -EOPNOTSUPP; - goto out; - } + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; + + /* + * We initially configure the limits in a hope that fallocate is + * supported and clear them here if that turns out not to be true. + */ + if (unlikely(ret == -EOPNOTSUPP)) + loop_clear_limits(lo, mode); + return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; @@ -471,7 +286,7 @@ static void lo_complete_rq(struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; - if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || + if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) ret = errno_to_blk_status(cmd->ret); @@ -487,14 +302,13 @@ static void lo_complete_rq(struct request *rq) cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { - if (cmd->use_aio) { - struct bio *bio = rq->bio; + struct bio *bio = rq->bio; - while (bio) { - zero_fill_bio(bio); - bio = bio->bi_next; - } + while (bio) { + zero_fill_bio(bio); + bio = bio->bi_next; } + ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); @@ -509,11 +323,13 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) return; kfree(cmd->bvec); cmd->bvec = NULL; + if (req_op(rq) == REQ_OP_WRITE) + kiocb_end_write(&cmd->iocb); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } -static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +static void lo_rw_aio_complete(struct kiocb *iocb, long ret) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); @@ -522,7 +338,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, bool rw) + loff_t pos, int rw) { struct iov_iter iter; struct req_iterator rq_iter; @@ -532,11 +348,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; - int nr_bvec = 0; + unsigned int nr_bvec; int ret; - rq_for_each_bvec(tmp, rq, rq_iter) - nr_bvec++; + nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { @@ -574,20 +389,26 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; - cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + cmd->iocb.ki_ioprio = req_get_ioprio(rq); + if (cmd->use_aio) { + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + } else { + cmd->iocb.ki_complete = NULL; + cmd->iocb.ki_flags = 0; + } - if (rw == WRITE) - ret = call_write_iter(file, &cmd->iocb, &iter); - else - ret = call_read_iter(file, &cmd->iocb, &iter); + if (rw == ITER_SOURCE) { + kiocb_start_write(&cmd->iocb); + ret = file->f_op->write_iter(&cmd->iocb, &iter); + } else + ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) - cmd->iocb.ki_complete(&cmd->iocb, ret, 0); - return 0; + lo_rw_aio_complete(&cmd->iocb, ret); + return -EIOCBQUEUED; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) @@ -595,15 +416,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - /* - * lo_write_simple and lo_read_simple should have been covered - * by io submit style function like lo_rw_aio(), one blocker - * is that lo_read_simple() need to call flush_dcache_page after - * the page is written from kernel, and it isn't easy to handle - * this in io submit style function which submits all segments - * of the req at one time. And direct read IO doesn't need to - * run flush_dcache_page(). - */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); @@ -619,31 +431,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: - if (lo->transfer) - return lo_write_transfer(lo, rq, pos); - else if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, WRITE); - else - return lo_write_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); case REQ_OP_READ: - if (lo->transfer) - return lo_read_transfer(lo, rq, pos); - else if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, READ); - else - return lo_read_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_DEST); default: WARN_ON_ONCE(1); return -EIO; } } -static inline void loop_update_dio(struct loop_device *lo) -{ - __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | - lo->use_dio); -} - static void loop_reread_partitions(struct loop_device *lo) { int rc; @@ -656,6 +452,28 @@ static void loop_reread_partitions(struct loop_device *lo) __func__, lo->lo_number, lo->lo_file_name, rc); } +static unsigned int loop_query_min_dio_size(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * Use the minimal dio alignment of the file system if provided. + */ + if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + return st.dio_offset_align; + + /* + * In a perfect world this wouldn't be needed, but as of Linux 6.13 only + * a handful of file systems support the STATX_DIOALIGN flag. + */ + if (sb_bdev) + return bdev_logical_block_size(sb_bdev); + return SECTOR_SIZE; +} + static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; @@ -672,13 +490,15 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) while (is_loop_device(f)) { struct loop_device *l; + lockdep_assert_held(&loop_validate_mutex); if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; - if (l->lo_state != Lo_bound) { + if (l->lo_state != Lo_bound) return -EINVAL; - } + /* Order wrt setting lo->lo_backing_file in loop_configure(). */ + rmb(); f = l->lo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) @@ -686,6 +506,28 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return 0; } +static void loop_assign_backing_file(struct loop_device *lo, struct file *file) +{ + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + if (lo->lo_backing_file->f_flags & O_DIRECT) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + lo->lo_min_dio_size = loop_query_min_dio_size(lo); +} + +static int loop_check_backing_file(struct file *file) +{ + if (!file->f_op->read_iter) + return -EINVAL; + + if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter) + return -EINVAL; + + return 0; +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -697,13 +539,29 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { - struct file *file = NULL, *old_file; - int error; - bool partscan; + struct file *file = fget(arg); + struct file *old_file; + unsigned int memflags; + int error; + bool partscan; + bool is_loop; - error = mutex_lock_killable(&lo->lo_mutex); - if (error) + if (!file) + return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); return error; + } + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + + is_loop = is_loop_device(file); + error = loop_global_lock_killable(lo, is_loop); + if (error) + goto out_putf; error = -ENXIO; if (lo->lo_state != Lo_bound) goto out_err; @@ -713,11 +571,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out_err; - error = -EBADF; - file = fget(arg); - if (!file) - goto out_err; - error = loop_validate_file(file, bdev); if (error) goto out_err; @@ -727,35 +580,55 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, error = -EINVAL; /* size of the new backing store needs to be the same */ - if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) + if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file)) goto out_err; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + /* and ... switch */ - blk_mq_freeze_queue(lo->lo_queue); + disk_force_media_change(lo->lo_disk); + memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, - lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); loop_update_dio(lo); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, is_loop); + + /* + * Flush loop_validate_file() before fput(), for l->lo_backing_file + * might be pointing at old_file which might be the last reference. + */ + if (!is_loop) { + mutex_lock(&loop_validate_mutex); + mutex_unlock(&loop_validate_mutex); + } /* * We must drop file reference outside of lo_mutex as dropping * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); if (partscan) loop_reread_partitions(lo); - return 0; -out_err: - mutex_unlock(&lo->lo_mutex); - if (file) - fput(file); + error = 0; +done: + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); return error; + +out_err: + loop_global_unlock(lo, is_loop); +out_putf: + fput(file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + goto done; } /* loop sysfs attributes */ @@ -803,33 +676,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); @@ -867,12 +740,12 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo) +static void loop_get_discard_config(struct loop_device *lo, + u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - struct request_queue *q = lo->lo_queue; - u32 granularity, max_discard_sectors; + struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing @@ -881,40 +754,20 @@ static void loop_config_discard(struct loop_device *lo) * not blkdev_issue_discard(). This maintains consistent behavior with * file-backed loop devices: discarded regions read back as zero. */ - if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { - struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); - max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: - queue_physical_block_size(backingq); + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); + *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. + * image a.k.a. discard. */ - } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { - max_discard_sectors = 0; - granularity = 0; - - } else { - max_discard_sectors = UINT_MAX >> 9; - granularity = inode->i_sb->s_blocksize; - } - - if (max_discard_sectors) { - q->limits.discard_granularity = granularity; - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - } else { - q->limits.discard_granularity = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } - q->limits.discard_alignment = 0; } struct loop_worker { @@ -928,8 +781,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -945,7 +796,7 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL; + struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; @@ -972,7 +823,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) if (worker) goto queue_work; - worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); + worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT); /* * In the event we cannot allocate a worker, just queue on the * rootcg worker and issue the I/O as the rootcg @@ -1013,59 +864,37 @@ queue_work: spin_unlock_irq(&lo->lo_work_lock); } -static void loop_update_rotational(struct loop_device *lo) +static void loop_set_timer(struct loop_device *lo) { - struct file *file = lo->lo_backing_file; - struct inode *file_inode = file->f_mapping->host; - struct block_device *file_bdev = file_inode->i_sb->s_bdev; - struct request_queue *q = lo->lo_queue; - bool nonrot = true; - - /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ - if (file_bdev) - nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); - - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); } -static int -loop_release_xfer(struct loop_device *lo) +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) { - int err = 0; - struct loop_func_table *xfer = lo->lo_encryption; - - if (xfer) { - if (xfer->release) - err = xfer->release(lo); - lo->transfer = NULL; - lo->lo_encryption = NULL; - module_put(xfer->owner); + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); } - return err; + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); } -static int -loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, - const struct loop_info64 *i) +static void loop_free_idle_workers_timer(struct timer_list *timer) { - int err = 0; - - if (xfer) { - struct module *owner = xfer->owner; + struct loop_device *lo = container_of(timer, struct loop_device, timer); - if (!try_module_get(owner)) - return -EINVAL; - if (xfer->init) - err = xfer->init(lo, i); - if (err) - module_put(owner); - else - lo->lo_encryption = xfer; - } - return err; + return loop_free_idle_workers(lo, false); } /** @@ -1080,89 +909,112 @@ static int loop_set_status_from_info(struct loop_device *lo, const struct loop_info64 *info) { - int err; - struct loop_func_table *xfer; - kuid_t uid = current_uid(); - if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; - err = loop_release_xfer(lo); - if (err) - return err; - - if (info->lo_encrypt_type) { - unsigned int type = info->lo_encrypt_type; - - if (type >= MAX_LO_CRYPT) - return -EINVAL; - xfer = xfer_funcs[type]; - if (xfer == NULL) - return -EINVAL; - } else - xfer = NULL; + switch (info->lo_encrypt_type) { + case LO_CRYPT_NONE: + break; + case LO_CRYPT_XOR: + pr_warn("support for the xor transformation has been removed.\n"); + return -EINVAL; + case LO_CRYPT_CRYPTOAPI: + pr_warn("support for cryptoloop has been removed. Use dm-crypt instead.\n"); + return -EINVAL; + default: + return -EINVAL; + } - err = loop_init_xfer(lo, xfer, info); - if (err) - return err; + /* Avoid assigning overflow values */ + if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) + return -EOVERFLOW; lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); - memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; - - if (!xfer) - xfer = &none_funcs; - lo->transfer = xfer->transfer; - lo->ioctl = xfer->ioctl; - - lo->lo_flags = info->lo_flags; - - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; - lo->lo_init[0] = info->lo_init[0]; - lo->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_key_size) { - memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, - info->lo_encrypt_key_size); - lo->lo_key_owner = uid; - } - return 0; } -static int loop_configure(struct loop_device *lo, fmode_t mode, +static unsigned int loop_default_blocksize(struct loop_device *lo) +{ + /* In case of direct I/O, match underlying minimum I/O size */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + return lo->lo_min_dio_size; + return SECTOR_SIZE; +} + +static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, + unsigned int bsize) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; + u32 granularity = 0, max_discard_sectors = 0; + + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + + if (!bsize) + bsize = loop_default_blocksize(lo); + + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + + lim->logical_block_size = bsize; + lim->physical_block_size = bsize; + lim->io_min = bsize; + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim->features |= BLK_FEAT_WRITE_CACHE; + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim->features |= BLK_FEAT_ROTATIONAL; + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim->discard_granularity = granularity; + else + lim->discard_granularity = 0; +} + +static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { - struct file *file; - struct inode *inode; - struct address_space *mapping; - int error; - loff_t size; - bool partscan; - unsigned short bsize; + struct file *file = fget(config->fd); + struct queue_limits lim; + int error; + loff_t size; + bool partscan; + bool is_loop; + + if (!file) + return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); + return error; + } + + is_loop = is_loop_device(file); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - error = -EBADF; - file = fget(config->fd); - if (!file) - goto out; - /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. */ - if (!(mode & FMODE_EXCL)) { - error = bd_prepare_to_claim(bdev, loop_configure); + if (!(mode & BLK_OPEN_EXCL)) { + error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; } - error = mutex_lock_killable(&lo->lo_mutex); + error = loop_global_lock_killable(lo, is_loop); if (error) goto out_bdev; @@ -1174,184 +1026,134 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (error) goto out_unlock; - mapping = file->f_mapping; - inode = mapping->host; - if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } - if (config->block_size) { - error = loop_validate_block_size(config->block_size); - if (error) - goto out_unlock; - } - error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; + lo->lo_flags = config->info.lo_flags; - if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + + disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, - TIMER_DEFERRABLE); - lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); - if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); - - if (config->block_size) - bsize = config->block_size; - else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) - /* In case of direct I/O, match underlying block size */ - bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - else - bsize = 512; + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, config->block_size); + /* No need to freeze the queue as the device isn't bound yet. */ + error = queue_limits_commit_update(lo->lo_queue, &lim); + if (error) + goto out_unlock; - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); - loop_config_discard(lo); - loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); - size = get_loop_size(lo, file); + size = lo_calculate_size(lo, file); loop_set_size(lo, size); + /* Order wrt reading lo_state in loop_validate_file(). */ + wmb(); + lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); - mutex_unlock(&lo->lo_mutex); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); + + loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); - if (!(mode & FMODE_EXCL)) + + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); + return 0; out_unlock: - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, is_loop); out_bdev: - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); -out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } -static int __loop_clr_fd(struct loop_device *lo, bool release) +static void __loop_clr_fd(struct loop_device *lo) { - struct file *filp = NULL; + struct queue_limits lim; + struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct block_device *bdev = lo->lo_device; - int err = 0; - bool partscan = false; - int lo_number; - struct loop_worker *pos, *worker; - - mutex_lock(&lo->lo_mutex); - if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { - err = -ENXIO; - goto out_unlock; - } - - filp = lo->lo_backing_file; - if (filp == NULL) { - err = -EINVAL; - goto out_unlock; - } - - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) - blk_queue_write_cache(lo->lo_queue, false, false); - - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); - - destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); - del_timer_sync(&lo->timer); spin_lock_irq(&lo->lo_lock); + filp = lo->lo_backing_file; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); - loop_release_xfer(lo); - lo->transfer = NULL; - lo->ioctl = NULL; lo->lo_device = NULL; - lo->lo_encryption = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; - lo->lo_encrypt_key_size = 0; - memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); - memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); - blk_queue_logical_block_size(lo->lo_queue, 512); - blk_queue_physical_block_size(lo->lo_queue, 512); - blk_queue_io_min(lo->lo_queue, 512); - if (bdev) { - bdput(bdev); - invalidate_bdev(bdev); - bdev->bd_inode->i_mapping->wb_err = 0; - } - set_capacity(lo->lo_disk, 0); + + /* + * Reset the block size to the default. + * + * No queue freezing needed because this is called from the final + * ->release call only, so there can't be any outstanding I/O. + */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); - if (bdev) { - /* let user-space know about this change */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - } + /* let user-space know about this change */ + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); - partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; - lo_number = lo->lo_number; -out_unlock: - mutex_unlock(&lo->lo_mutex); - if (partscan) { + disk_force_media_change(lo->lo_disk); + + if (lo->lo_flags & LO_FLAGS_PARTSCAN) { + int err; + /* * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. @@ -1360,31 +1162,23 @@ out_unlock: * must be at least one and it can only become zero when the * current holder is released. */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", - __func__, lo_number, err); + __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ - err = 0; } /* * lo->lo_state is set to Lo_unbound here after above partscan has - * finished. - * - * There cannot be anybody else entering __loop_clr_fd() as - * lo->lo_backing_file is already cleared and Lo_rundown state - * protects us from all the other places trying to change the 'lo' - * device. + * finished. There cannot be anybody else entering __loop_clr_fd() as + * Lo_rundown state protects us from all the other places trying to + * change the 'lo' device. */ - mutex_lock(&lo->lo_mutex); lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); @@ -1393,61 +1187,54 @@ out_unlock: * lo_mutex triggers a circular lock dependency possibility warning as * fput can take open_mutex which is usually taken before lo_mutex. */ - if (filp) - fput(filp); - return err; + fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to roundown here to + * prevent new openers from coming in. */ - if (atomic_read(&lo->lo_refcnt) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); - return 0; - } - lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - return __loop_clr_fd(lo, false); + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; + loop_global_unlock(lo, true); + + return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - kuid_t uid = current_uid(); - int prev_lo_flags; bool partscan = false; bool size_changed = false; + unsigned int memflags; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; - if (lo->lo_encrypt_key_size && - !uid_eq(lo->lo_key_owner, uid) && - !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out_unlock; - } if (lo->lo_state != Lo_bound) { err = -ENXIO; goto out_unlock; @@ -1460,49 +1247,29 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) invalidate_bdev(lo->lo_device); } - /* I/O need to be drained during transfer transition */ - blk_mq_freeze_queue(lo->lo_queue); - - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - - prev_lo_flags = lo->lo_flags; + /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ + memflags = blk_mq_freeze_queue(lo->lo_queue); err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - /* Mask out flags that can't be set using LOOP_SET_STATUS. */ - lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For those flags, use the previous values instead */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For flags that can't be cleared, use previous values too */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && + (info->lo_flags & LO_FLAGS_PARTSCAN); - if (size_changed) { - loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, - lo->lo_backing_file); - loop_set_size(lo, new_size); - } + lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); - loop_config_discard(lo); - - /* update dio if lo_offset or transfer is changed */ - __loop_update_dio(lo, lo->use_dio); + /* update the direct I/O flag if lo_offset changed */ + loop_update_dio(lo); out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); - - if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && - !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - partscan = true; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + if (partscan) + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + if (!err && size_changed) { + loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file); + loop_set_size(lo, new_size); } out_unlock: mutex_unlock(&lo->lo_mutex); @@ -1533,14 +1300,6 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) info->lo_sizelimit = lo->lo_sizelimit; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); - info->lo_encrypt_type = - lo->lo_encryption ? lo->lo_encryption->number : 0; - if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { - info->lo_encrypt_key_size = lo->lo_encrypt_key_size; - memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, - lo->lo_encrypt_key_size); - } /* Drop lo_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; @@ -1566,16 +1325,8 @@ loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; - info64->lo_encrypt_type = info->lo_encrypt_type; - info64->lo_encrypt_key_size = info->lo_encrypt_key_size; info64->lo_flags = info->lo_flags; - info64->lo_init[0] = info->lo_init[0]; - info64->lo_init[1] = info->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); - else - memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); - memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); } static int @@ -1587,16 +1338,8 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; - info->lo_encrypt_type = info64->lo_encrypt_type; - info->lo_encrypt_key_size = info64->lo_encrypt_key_size; info->lo_flags = info64->lo_flags; - info->lo_init[0] = info64->lo_init[0]; - info->lo_init[1] = info64->lo_init[1]; - if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); - else - memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); - memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info->lo_device != info64->lo_device || @@ -1668,7 +1411,7 @@ static int loop_set_capacity(struct loop_device *lo) if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - size = get_loop_size(lo, lo->lo_backing_file); + size = lo_calculate_size(lo, lo->lo_backing_file); loop_set_size(lo, size); return 0; @@ -1676,53 +1419,75 @@ static int loop_set_capacity(struct loop_device *lo) static int loop_set_dio(struct loop_device *lo, unsigned long arg) { - int error = -ENXIO; - if (lo->lo_state != Lo_bound) - goto out; + bool use_dio = !!arg; + unsigned int memflags; - __loop_update_dio(lo, !!arg); - if (lo->use_dio == !!arg) + if (lo->lo_state != Lo_bound) + return -ENXIO; + if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; - error = -EINVAL; - out: - return error; + + if (use_dio) { + if (!lo_can_use_dio(lo)) + return -EINVAL; + /* flush dirty pages before starting to use direct I/O */ + vfs_fsync(lo->lo_backing_file, 0); + } + + memflags = blk_mq_freeze_queue(lo->lo_queue); + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + return 0; } -static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, unsigned long arg) { + struct queue_limits lim; + unsigned int memflags; int err = 0; - if (lo->lo_state != Lo_bound) - return -ENXIO; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. + */ + if (!(mode & BLK_OPEN_EXCL)) { + err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL); + if (err) + return err; + } - err = loop_validate_block_size(arg); + err = mutex_lock_killable(&lo->lo_mutex); if (err) - return err; + goto abort_claim; + + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto unlock; + } if (lo->lo_queue->limits.logical_block_size == arg) - return 0; + goto unlock; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); - blk_mq_freeze_queue(lo->lo_queue); - - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, arg); - blk_queue_logical_block_size(lo->lo_queue, arg); - blk_queue_physical_block_size(lo->lo_queue, arg); - blk_queue_io_min(lo->lo_queue, arg); + memflags = blk_mq_freeze_queue(lo->lo_queue); + err = queue_limits_commit_update(lo->lo_queue, &lim); loop_update_dio(lo); -out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); +unlock: + mutex_unlock(&lo->lo_mutex); +abort_claim: + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_set_block_size); return err; } @@ -1741,17 +1506,14 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; - case LOOP_SET_BLOCK_SIZE: - err = loop_set_block_size(lo, arg); - break; default: - err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + err = -EINVAL; } mutex_unlock(&lo->lo_mutex); return err; } -static int lo_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; @@ -1786,24 +1548,25 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); - } break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); - } break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); + case LOOP_SET_BLOCK_SIZE: + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: - case LOOP_SET_BLOCK_SIZE: - if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: @@ -1821,7 +1584,7 @@ struct compat_loop_info { compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; - compat_int_t lo_encrypt_type; + compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; @@ -1850,16 +1613,8 @@ loop_info64_from_compat(const struct compat_loop_info __user *arg, info64->lo_rdevice = info.lo_rdevice; info64->lo_offset = info.lo_offset; info64->lo_sizelimit = 0; - info64->lo_encrypt_type = info.lo_encrypt_type; - info64->lo_encrypt_key_size = info.lo_encrypt_key_size; info64->lo_flags = info.lo_flags; - info64->lo_init[0] = info.lo_init[0]; - info64->lo_init[1] = info.lo_init[1]; - if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); - else - memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); - memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); + memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); return 0; } @@ -1879,24 +1634,14 @@ loop_info64_to_compat(const struct loop_info64 *info64, info.lo_inode = info64->lo_inode; info.lo_rdevice = info64->lo_rdevice; info.lo_offset = info64->lo_offset; - info.lo_encrypt_type = info64->lo_encrypt_type; - info.lo_encrypt_key_size = info64->lo_encrypt_key_size; info.lo_flags = info64->lo_flags; - info.lo_init[0] = info64->lo_init[0]; - info.lo_init[1] = info64->lo_init[1]; - if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) - memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); - else - memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); - memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); /* error in case values were truncated */ if (info.lo_device != info64->lo_device || info.lo_rdevice != info64->lo_rdevice || info.lo_inode != info64->lo_inode || - info.lo_offset != info64->lo_offset || - info.lo_init[0] != info64->lo_init[0] || - info.lo_init[1] != info64->lo_init[1]) + info.lo_offset != info64->lo_offset) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) @@ -1932,7 +1677,7 @@ loop_get_status_compat(struct loop_device *lo, return err; } -static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; @@ -1968,112 +1713,141 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) +static int lo_open(struct gendisk *disk, blk_mode_t mode) { - struct loop_device *lo = bdev->bd_disk->private_data; + struct loop_device *lo = disk->private_data; int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; - if (lo->lo_state == Lo_deleting) + + if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); mutex_unlock(&lo->lo_mutex); return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + bool need_clear = false; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; + /* + * Clear the backing device information if this is the last close of + * a device that's been marked for auto clear, or on which LOOP_CLR_FD + * has been called. + */ - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - __loop_clr_fd(lo, true); - return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); - } -out_unlock: + need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); + + if (need_clear) + __loop_clr_fd(lo); +} + +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + timer_shutdown_sync(&lo->timer); + mutex_destroy(&lo->lo_mutex); + kfree(lo); } static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, + .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* * And now the modules code and kernel interface. */ -static int max_loop; -module_param(max_loop, int, 0444); -MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); -module_param(max_part, int, 0444); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); -int loop_register_transfer(struct loop_func_table *funcs) -{ - unsigned int n = funcs->number; +/* + * If max_loop is specified, create that many devices upfront. + * This also becomes a hard limit. If max_loop is not specified, + * the default isn't a hard limit (as before commit 85c50197716c + * changed the default value from 0 for max_loop=0 reasons), just + * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module + * init time. Loop devices can be requested on-demand with the + * /dev/loop-control interface, or be instantiated by accessing + * a 'dead' device node. + */ +static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; - if (n >= MAX_LO_CRYPT || xfer_funcs[n]) - return -EINVAL; - xfer_funcs[n] = funcs; - return 0; -} +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD +static bool max_loop_specified; -static int unregister_transfer_cb(int id, void *ptr, void *data) +static int max_loop_param_set_int(const char *val, + const struct kernel_param *kp) { - struct loop_device *lo = ptr; - struct loop_func_table *xfer = data; + int ret; - mutex_lock(&lo->lo_mutex); - if (lo->lo_encryption == xfer) - loop_release_xfer(lo); - mutex_unlock(&lo->lo_mutex); + ret = param_set_int(val, kp); + if (ret < 0) + return ret; + + max_loop_specified = true; return 0; } -int loop_unregister_transfer(int number) +static const struct kernel_param_ops max_loop_param_ops = { + .set = max_loop_param_set_int, + .get = param_get_int, +}; + +module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); +#else +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); +#endif + +module_param(max_part, int, 0444); +MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { - unsigned int n = number; - struct loop_func_table *xfer; + int qd, ret; - if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) + ret = kstrtoint(s, 0, &qd); + if (ret < 0) + return ret; + if (qd < 1) return -EINVAL; - - xfer_funcs[n] = NULL; - idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer); + hw_queue_depth = qd; return 0; } -EXPORT_SYMBOL(loop_register_transfer); -EXPORT_SYMBOL(loop_unregister_transfer); +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); + +MODULE_DESCRIPTION("Loopback device support"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) @@ -2094,7 +1868,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->use_aio = false; break; default: - cmd->use_aio = lo->use_dio; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } @@ -2102,12 +1876,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (rq->bio && rq->bio->bi_blkg) { - cmd->blkcg_css = &bio_blkcg(rq->bio)->css; + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } #endif } #endif @@ -2118,6 +1894,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, static void loop_handle_cmd(struct loop_cmd *cmd) { + struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; + struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; @@ -2129,24 +1907,34 @@ static void loop_handle_cmd(struct loop_cmd *cmd) goto failed; } - if (cmd->blkcg_css) - kthread_associate_blkcg(cmd->blkcg_css); - if (cmd->memcg_css) + /* We can block in this context, so ignore REQ_NOWAIT. */ + if (rq->cmd_flags & REQ_NOWAIT) + rq->cmd_flags &= ~REQ_NOWAIT; + + if (cmd_blkcg_css) + kthread_associate_blkcg(cmd_blkcg_css); + if (cmd_memcg_css) old_memcg = set_active_memcg( - mem_cgroup_from_css(cmd->memcg_css)); + mem_cgroup_from_css(cmd_memcg_css)); + /* + * do_req_filebacked() may call blk_mq_complete_request() synchronously + * or asynchronously if using aio. Hence, do not touch 'cmd' after + * do_req_filebacked() has returned unless we are sure that 'cmd' has + * not yet been completed. + */ ret = do_req_filebacked(lo, rq); - if (cmd->blkcg_css) + if (cmd_blkcg_css) kthread_associate_blkcg(NULL); - if (cmd->memcg_css) { + if (cmd_memcg_css) { set_active_memcg(old_memcg); - css_put(cmd->memcg_css); + css_put(cmd_memcg_css); } failed: /* complete non-aio request */ - if (!cmd->use_aio || ret) { + if (ret != -EIOCBQUEUED) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else @@ -2156,11 +1944,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -2209,27 +1992,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, @@ -2237,6 +1999,12 @@ static const struct blk_mq_ops loop_mq_ops = { static int loop_add(int i) { + struct queue_limits lim = { + /* + * Random number picked from the historic block max_sectors cap. + */ + .max_hw_sectors = 2560u, + }; struct loop_device *lo; struct gendisk *disk; int err; @@ -2245,6 +2013,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); @@ -2259,40 +2030,30 @@ static int loop_add(int i) } else { err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } + mutex_unlock(&loop_ctl_mutex); if (err < 0) - goto out_unlock; + goto out_free_dev; i = err; - err = -ENOMEM; lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; - blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); - - /* - * By default, we do buffer IO, so it doesn't make sense to enable - * merge because the I/O submitted to backing file is handled page by - * page. For directio mode, merge does help to dispatch bigger request - * to underlayer disk. We will enable merge once directio is enabled. - */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its @@ -2312,29 +2073,41 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART_SCAN; - disk->flags |= GENHD_FL_EXT_DEVT; - atomic_set(&lo->lo_refcnt, 0); + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); - add_disk(disk); + /* Make this loop device reachable from pathname. */ + err = add_disk(disk); + if (err) + goto out_cleanup_disk; + + /* Show this loop device. */ + mutex_lock(&loop_ctl_mutex); + lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); + return i; +out_cleanup_disk: + put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: + mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, i); -out_unlock: mutex_unlock(&loop_ctl_mutex); out_free_dev: kfree(lo); @@ -2344,21 +2117,29 @@ out: static void loop_remove(struct loop_device *lo) { + /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + mutex_lock(&loop_ctl_mutex); + idr_remove(&loop_index_idr, lo->lo_number); + mutex_unlock(&loop_ctl_mutex); + + put_disk(lo->lo_disk); } +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; - if (max_loop && idx >= max_loop) + if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } +#else +#define loop_probe NULL +#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { @@ -2366,35 +2147,43 @@ static int loop_control_remove(int idx) int ret; if (idx < 0) { - pr_warn("deleting an unspecified loop device is not supported.\n"); + pr_warn_once("deleting an unspecified loop device is not supported.\n"); return -EINVAL; } + /* Hide this loop device for serialization. */ ret = mutex_lock_killable(&loop_ctl_mutex); if (ret) return ret; - lo = idr_find(&loop_index_idr, idx); - if (!lo) { + if (!lo || !lo->idr_visible) ret = -ENODEV; - goto out_unlock_ctrl; - } + else + lo->idr_visible = false; + mutex_unlock(&loop_ctl_mutex); + if (ret) + return ret; + /* Check whether this loop device can be removed. */ ret = mutex_lock_killable(&lo->lo_mutex); if (ret) - goto out_unlock_ctrl; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + goto mark_visible; + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; - goto out_unlock_ctrl; + goto mark_visible; } + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); - idr_remove(&loop_index_idr, lo->lo_number); loop_remove(lo); -out_unlock_ctrl: + return 0; + +mark_visible: + /* Show this loop device again. */ + mutex_lock(&loop_ctl_mutex); + lo->idr_visible = true; mutex_unlock(&loop_ctl_mutex); return ret; } @@ -2408,7 +2197,8 @@ static int loop_control_get_free(int idx) if (ret) return ret; idr_for_each_entry(&loop_index_idr, lo, id) { - if (lo->lo_state == Lo_unbound) + /* Hitting a race results in creating a new loop device which is harmless. */ + if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound) goto found; } mutex_unlock(&loop_ctl_mutex); @@ -2452,7 +2242,7 @@ MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { - int i, nr; + int i; int err; part_shift = 0; @@ -2480,19 +2270,6 @@ static int __init loop_init(void) goto err_out; } - /* - * If max_loop is specified, create that many devices upfront. - * This also becomes a hard limit. If max_loop is not specified, - * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module - * init time. Loop devices can be requested on-demand with the - * /dev/loop-control interface, or be instantiated by accessing - * a 'dead' device node. - */ - if (max_loop) - nr = max_loop; - else - nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT; - err = misc_register(&loop_misc); if (err < 0) goto err_out; @@ -2504,7 +2281,7 @@ static int __init loop_init(void) } /* pre-create number of devices given by config or max_loop */ - for (i = 0; i < nr; i++) + for (i = 0; i < max_loop; i++) loop_add(i); printk(KERN_INFO "loop: module loaded\n"); @@ -2524,10 +2301,14 @@ static void __exit loop_exit(void) unregister_blkdev(LOOP_MAJOR, "loop"); misc_deregister(&loop_misc); - mutex_lock(&loop_ctl_mutex); + /* + * There is no need to use loop_ctl_mutex here, for nobody else can + * access loop_index_idr when this module is unloading (unless forced + * module unloading is requested). If this is not a clean unloading, + * we have no means to avoid kernel crash. + */ idr_for_each_entry(&loop_index_idr, lo, id) loop_remove(lo); - mutex_unlock(&loop_ctl_mutex); idr_destroy(&loop_index_idr); } @@ -2539,6 +2320,9 @@ module_exit(loop_exit); static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD + max_loop_specified = true; +#endif return 1; } |
