diff options
Diffstat (limited to 'drivers/block/loop.c')
| -rw-r--r-- | drivers/block/loop.c | 1284 |
1 files changed, 661 insertions, 623 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 01cbbfc4e9e2..272bc608e528 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll <rkroll@exploits.org> 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe <axboe@suse.de>, Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. */ - #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> @@ -59,7 +12,6 @@ #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> -#include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> @@ -79,12 +31,64 @@ #include <linux/ioprio.h> #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> +#include <linux/statfs.h> +#include <linux/uaccess.h> +#include <linux/blk-mq.h> +#include <linux/spinlock.h> +#include <uapi/linux/loop.h> + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; -#include "loop.h" +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file *lo_backing_file; + unsigned int lo_min_dio_size; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; -#include <linux/uaccess.h> +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -133,20 +137,35 @@ static void loop_global_unlock(struct loop_device *lo, bool global) static int max_part; static int part_shift; -static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) +static loff_t lo_calculate_size(struct loop_device *lo, struct file *file) { loff_t loopsize; + int ret; - /* Compute loopsize in bytes */ - loopsize = i_size_read(file->f_mapping->host); - if (offset > 0) - loopsize -= offset; + if (S_ISBLK(file_inode(file)->i_mode)) { + loopsize = i_size_read(file->f_mapping->host); + } else { + struct kstat stat; + + /* + * Get the accurate file size. This provides better results than + * cached inode data, particularly for network filesystems where + * metadata may be stale. + */ + ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0); + if (ret) + return 0; + + loopsize = stat.size; + } + + if (lo->lo_offset > 0) + loopsize -= lo->lo_offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; - - if (sizelimit > 0 && sizelimit < loopsize) - loopsize = sizelimit; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. @@ -154,68 +173,38 @@ static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) return loopsize >> 9; } -static loff_t get_loop_size(struct loop_device *lo, struct file *file) +/* + * We support direct I/O only if lo_offset is aligned with the logical I/O size + * of backing device, and the logical block size of loop is bigger than that of + * the backing device. + */ +static bool lo_can_use_dio(struct loop_device *lo) { - return get_size(lo->lo_offset, lo->lo_sizelimit, file); + if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) + return false; + if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) + return false; + if (lo->lo_offset & (lo->lo_min_dio_size - 1)) + return false; + return true; } -static void __loop_update_dio(struct loop_device *lo, bool dio) +/* + * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by + * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently + * disabled when the device block size is too small or the offset is unaligned. + * + * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and + * not the originally passed in one. + */ +static inline void loop_update_dio(struct loop_device *lo) { - struct file *file = lo->lo_backing_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned short sb_bsize = 0; - unsigned dio_align = 0; - bool use_dio; - - if (inode->i_sb->s_bdev) { - sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - dio_align = sb_bsize - 1; - } + lockdep_assert_held(&lo->lo_mutex); + WARN_ON_ONCE(lo->lo_state == Lo_bound && + lo->lo_queue->mq_freeze_depth == 0); - /* - * We support direct I/O only if lo_offset is aligned with the - * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's. - * - * TODO: the above condition may be loosed in the future, and - * direct I/O may be switched runtime at that time because most - * of requests in sane applications should be PAGE_SIZE aligned - */ - if (dio) { - if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - mapping->a_ops->direct_IO) - use_dio = true; - else - use_dio = false; - } else { - use_dio = false; - } - - if (lo->use_dio == use_dio) - return; - - /* flush dirty pages before changing direct IO */ - vfs_fsync(file, 0); - - /* - * The flag of LO_FLAGS_DIRECT_IO is handled similarly with - * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup - * will get updated by ioctl(LOOP_GET_STATUS) - */ - if (lo->lo_state == Lo_bound) - blk_mq_freeze_queue(lo->lo_queue); - lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); - lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } - if (lo->lo_state == Lo_bound) - blk_mq_unfreeze_queue(lo->lo_queue); } /** @@ -232,72 +221,26 @@ static void loop_set_size(struct loop_device *lo, loff_t size) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } -static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) -{ - struct iov_iter i; - ssize_t bw; - - iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); - - file_start_write(file); - bw = vfs_iter_write(file, &i, ppos, 0); - file_end_write(file); - - if (likely(bw == bvec->bv_len)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Write error at byte offset %llu, length %i.\n", - (unsigned long long)*ppos, bvec->bv_len); - if (bw >= 0) - bw = -EIO; - return bw; -} - -static int lo_write_simple(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec; - struct req_iterator iter; - int ret = 0; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); - if (ret < 0) - break; - cond_resched(); - } - - return ret; -} - -static int lo_read_simple(struct loop_device *lo, struct request *rq, - loff_t pos) +static void loop_clear_limits(struct loop_device *lo, int mode) { - struct bio_vec bvec; - struct req_iterator iter; - struct iov_iter i; - ssize_t len; + struct queue_limits lim = queue_limits_start_update(lo->lo_queue); - rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) - return len; + if (mode & FALLOC_FL_ZERO_RANGE) + lim.max_write_zeroes_sectors = 0; - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - cond_resched(); + if (mode & FALLOC_FL_PUNCH_HOLE) { + lim.max_hw_discard_sectors = 0; + lim.discard_granularity = 0; } - return 0; + /* + * XXX: this updates the queue limits without freezing the queue, which + * is against the locking protocol and dangerous. But we can't just + * freeze the queue as we're inside the ->queue_rq method here. So this + * should move out into a workqueue unless we get the file operations to + * advertise if they support specific fallocate operations. + */ + queue_limits_commit_update(lo->lo_queue, &lim); } static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, @@ -308,27 +251,30 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(q)) { - ret = -EOPNOTSUPP; - goto out; - } + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; + + /* + * We initially configure the limits in a hope that fallocate is + * supported and clear them here if that turns out not to be true. + */ + if (unlikely(ret == -EOPNOTSUPP)) + loop_clear_limits(lo, mode); + return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; @@ -340,7 +286,7 @@ static void lo_complete_rq(struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; - if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || + if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) ret = errno_to_blk_status(cmd->ret); @@ -356,14 +302,13 @@ static void lo_complete_rq(struct request *rq) cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { - if (cmd->use_aio) { - struct bio *bio = rq->bio; + struct bio *bio = rq->bio; - while (bio) { - zero_fill_bio(bio); - bio = bio->bi_next; - } + while (bio) { + zero_fill_bio(bio); + bio = bio->bi_next; } + ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); @@ -378,6 +323,8 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) return; kfree(cmd->bvec); cmd->bvec = NULL; + if (req_op(rq) == REQ_OP_WRITE) + kiocb_end_write(&cmd->iocb); if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } @@ -391,7 +338,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret) } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, bool rw) + loff_t pos, int rw) { struct iov_iter iter; struct req_iterator rq_iter; @@ -401,11 +348,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; - int nr_bvec = 0; + unsigned int nr_bvec; int ret; - rq_for_each_bvec(tmp, rq, rq_iter) - nr_bvec++; + nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { @@ -443,20 +389,26 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; - cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + cmd->iocb.ki_ioprio = req_get_ioprio(rq); + if (cmd->use_aio) { + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + } else { + cmd->iocb.ki_complete = NULL; + cmd->iocb.ki_flags = 0; + } - if (rw == WRITE) - ret = call_write_iter(file, &cmd->iocb, &iter); - else - ret = call_read_iter(file, &cmd->iocb, &iter); + if (rw == ITER_SOURCE) { + kiocb_start_write(&cmd->iocb); + ret = file->f_op->write_iter(&cmd->iocb, &iter); + } else + ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); - return 0; + return -EIOCBQUEUED; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) @@ -464,15 +416,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - /* - * lo_write_simple and lo_read_simple should have been covered - * by io submit style function like lo_rw_aio(), one blocker - * is that lo_read_simple() need to call flush_dcache_page after - * the page is written from kernel, and it isn't easy to handle - * this in io submit style function which submits all segments - * of the req at one time. And direct read IO doesn't need to - * run flush_dcache_page(). - */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); @@ -488,27 +431,15 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, WRITE); - else - return lo_write_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); case REQ_OP_READ: - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, READ); - else - return lo_read_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_DEST); default: WARN_ON_ONCE(1); return -EIO; } } -static inline void loop_update_dio(struct loop_device *lo) -{ - __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | - lo->use_dio); -} - static void loop_reread_partitions(struct loop_device *lo) { int rc; @@ -521,6 +452,28 @@ static void loop_reread_partitions(struct loop_device *lo) __func__, lo->lo_number, lo->lo_file_name, rc); } +static unsigned int loop_query_min_dio_size(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * Use the minimal dio alignment of the file system if provided. + */ + if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + return st.dio_offset_align; + + /* + * In a perfect world this wouldn't be needed, but as of Linux 6.13 only + * a handful of file systems support the STATX_DIOALIGN flag. + */ + if (sb_bdev) + return bdev_logical_block_size(sb_bdev); + return SECTOR_SIZE; +} + static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; @@ -553,6 +506,28 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return 0; } +static void loop_assign_backing_file(struct loop_device *lo, struct file *file) +{ + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + if (lo->lo_backing_file->f_flags & O_DIRECT) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + lo->lo_min_dio_size = loop_query_min_dio_size(lo); +} + +static int loop_check_backing_file(struct file *file) +{ + if (!file->f_op->read_iter) + return -EINVAL; + + if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter) + return -EINVAL; + + return 0; +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -566,12 +541,23 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, { struct file *file = fget(arg); struct file *old_file; + unsigned int memflags; int error; bool partscan; bool is_loop; if (!file) return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); + return error; + } + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) @@ -594,19 +580,23 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, error = -EINVAL; /* size of the new backing store needs to be the same */ - if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) + if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file)) goto out_err; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + /* and ... switch */ - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); - blk_mq_freeze_queue(lo->lo_queue); + disk_force_media_change(lo->lo_disk); + memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, - lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); loop_update_dio(lo); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; loop_global_unlock(lo, is_loop); @@ -624,15 +614,21 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, * dependency. */ fput(old_file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); if (partscan) loop_reread_partitions(lo); - return 0; + + error = 0; +done: + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); + return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); - return error; + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + goto done; } /* loop sysfs attributes */ @@ -680,33 +676,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); @@ -744,12 +740,12 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo) +static void loop_get_discard_config(struct loop_device *lo, + u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - struct request_queue *q = lo->lo_queue; - u32 granularity, max_discard_sectors; + struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing @@ -759,37 +755,19 @@ static void loop_config_discard(struct loop_device *lo) * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode)) { - struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); + struct block_device *bdev = I_BDEV(inode); - max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: - queue_physical_block_size(backingq); + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); + *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. */ - } else if (!file->f_op->fallocate) { - max_discard_sectors = 0; - granularity = 0; - - } else { - max_discard_sectors = UINT_MAX >> 9; - granularity = inode->i_sb->s_blocksize; - } - - if (max_discard_sectors) { - q->limits.discard_granularity = granularity; - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - } else { - q->limits.discard_granularity = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } - q->limits.discard_alignment = 0; } struct loop_worker { @@ -803,8 +781,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -847,7 +823,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) if (worker) goto queue_work; - worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); + worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT); /* * In the event we cannot allocate a worker, just queue on the * rootcg worker and issue the I/O as the rootcg @@ -888,22 +864,37 @@ queue_work: spin_unlock_irq(&lo->lo_work_lock); } -static void loop_update_rotational(struct loop_device *lo) +static void loop_set_timer(struct loop_device *lo) { - struct file *file = lo->lo_backing_file; - struct inode *file_inode = file->f_mapping->host; - struct block_device *file_bdev = file_inode->i_sb->s_bdev; - struct request_queue *q = lo->lo_queue; - bool nonrot = true; + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} - /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ - if (file_bdev) - nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_free_idle_workers_timer(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); } /** @@ -934,29 +925,80 @@ loop_set_status_from_info(struct loop_device *lo, return -EINVAL; } + /* Avoid assigning overflow values */ + if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) + return -EOVERFLOW; + lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_flags = info->lo_flags; return 0; } -static int loop_configure(struct loop_device *lo, fmode_t mode, +static unsigned int loop_default_blocksize(struct loop_device *lo) +{ + /* In case of direct I/O, match underlying minimum I/O size */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + return lo->lo_min_dio_size; + return SECTOR_SIZE; +} + +static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, + unsigned int bsize) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; + u32 granularity = 0, max_discard_sectors = 0; + + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + + if (!bsize) + bsize = loop_default_blocksize(lo); + + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + + lim->logical_block_size = bsize; + lim->physical_block_size = bsize; + lim->io_min = bsize; + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim->features |= BLK_FEAT_WRITE_CACHE; + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim->features |= BLK_FEAT_ROTATIONAL; + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim->discard_granularity = granularity; + else + lim->discard_granularity = 0; +} + +static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { struct file *file = fget(config->fd); - struct inode *inode; - struct address_space *mapping; + struct queue_limits lim; int error; loff_t size; bool partscan; - unsigned short bsize; bool is_loop; if (!file) return -EBADF; + + error = loop_check_backing_file(file); + if (error) { + fput(file); + return error; + } + is_loop = is_loop_device(file); /* This is safe, since we have a reference from open(). */ @@ -966,8 +1008,8 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. */ - if (!(mode & FMODE_EXCL)) { - error = bd_prepare_to_claim(bdev, loop_configure); + if (!(mode & BLK_OPEN_EXCL)) { + error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; } @@ -984,73 +1026,57 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (error) goto out_unlock; - mapping = file->f_mapping; - inode = mapping->host; - if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } - if (config->block_size) { - error = blk_validate_block_size(config->block_size); - if (error) - goto out_unlock; - } - error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; + lo->lo_flags = config->info.lo_flags; - if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + + disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, - TIMER_DEFERRABLE); - lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - - if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); + loop_assign_backing_file(lo, file); - if (config->block_size) - bsize = config->block_size; - else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) - /* In case of direct I/O, match underlying block size */ - bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - else - bsize = 512; + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, config->block_size); + /* No need to freeze the queue as the device isn't bound yet. */ + error = queue_limits_commit_update(lo->lo_queue, &lim); + if (error) + goto out_unlock; - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); - loop_config_discard(lo); - loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); - size = get_loop_size(lo, file); + size = lo_calculate_size(lo, file); loop_set_size(lo, size); /* Order wrt reading lo_state in loop_validate_file(). */ @@ -1061,19 +1087,24 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; if (partscan) - lo->lo_disk->flags &= ~GENHD_FL_NO_PART; + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); - if (!(mode & FMODE_EXCL)) + + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); + return 0; out_unlock: loop_global_unlock(lo, is_loop); out_bdev: - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); @@ -1084,48 +1115,9 @@ out_putf: static void __loop_clr_fd(struct loop_device *lo) { + struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct loop_worker *pos, *worker; - - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); - - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) - blk_queue_write_cache(lo->lo_queue, false, false); - - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); - - destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); - del_timer_sync(&lo->timer); spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; @@ -1136,100 +1128,99 @@ static void __loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - blk_queue_logical_block_size(lo->lo_queue, 512); - blk_queue_physical_block_size(lo->lo_queue, 512); - blk_queue_io_min(lo->lo_queue, 512); + + /* + * Reset the block size to the default. + * + * No queue freezing needed because this is called from the final + * ->release call only, so there can't be any outstanding I/O. + */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); mapping_set_gfp_mask(filp->f_mapping, gfp); - blk_mq_unfreeze_queue(lo->lo_queue); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + disk_force_media_change(lo->lo_disk); if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; - mutex_lock(&lo->lo_disk->open_mutex); + /* + * open_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ err = bdev_disk_changed(lo->lo_disk, false); - mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); /* Device is gone, no point in returning error */ } + /* + * lo->lo_state is set to Lo_unbound here after above partscan has + * finished. There cannot be anybody else entering __loop_clr_fd() as + * Lo_rundown state protects us from all the other places trying to + * change the 'lo' device. + */ lo->lo_flags = 0; if (!part_shift) - lo->lo_disk->flags |= GENHD_FL_NO_PART; - - fput(filp); -} - -static void loop_rundown_completed(struct loop_device *lo) -{ + set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); - module_put(THIS_MODULE); -} - -static void loop_rundown_workfn(struct work_struct *work) -{ - struct loop_device *lo = container_of(work, struct loop_device, - rundown_work); - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __loop_clr_fd(lo); - kobject_put(&bdev->bd_device.kobj); - module_put(disk->fops->owner); - loop_rundown_completed(lo); -} -static void loop_schedule_rundown(struct loop_device *lo) -{ - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __module_get(disk->fops->owner); - kobject_get(&bdev->bd_device.kobj); - INIT_WORK(&lo->rundown_work, loop_rundown_workfn); - queue_work(system_long_wq, &lo->rundown_work); + /* + * Need not hold lo_mutex to fput backing file. Calling fput holding + * lo_mutex triggers a circular lock dependency possibility warning as + * fput can take open_mutex which is usually taken before lo_mutex. + */ + fput(filp); } static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to roundown here to + * prevent new openers from coming in. */ - if (atomic_read(&lo->lo_refcnt) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); - return 0; - } - lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - __loop_clr_fd(lo); - loop_rundown_completed(lo); + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; + loop_global_unlock(lo, true); + return 0; } @@ -1237,9 +1228,9 @@ static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - int prev_lo_flags; bool partscan = false; bool size_changed = false; + unsigned int memflags; err = mutex_lock_killable(&lo->lo_mutex); if (err) @@ -1256,49 +1247,29 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) invalidate_bdev(lo->lo_device); } - /* I/O need to be drained during transfer transition */ - blk_mq_freeze_queue(lo->lo_queue); - - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - - prev_lo_flags = lo->lo_flags; + /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ + memflags = blk_mq_freeze_queue(lo->lo_queue); err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - /* Mask out flags that can't be set using LOOP_SET_STATUS. */ - lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For those flags, use the previous values instead */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For flags that can't be cleared, use previous values too */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && + (info->lo_flags & LO_FLAGS_PARTSCAN); - if (size_changed) { - loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, - lo->lo_backing_file); - loop_set_size(lo, new_size); - } + lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); - loop_config_discard(lo); - - /* update dio if lo_offset or transfer is changed */ - __loop_update_dio(lo, lo->use_dio); + /* update the direct I/O flag if lo_offset changed */ + loop_update_dio(lo); out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); - - if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && - !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_disk->flags &= ~GENHD_FL_NO_PART; - partscan = true; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + if (partscan) + clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + if (!err && size_changed) { + loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file); + loop_set_size(lo, new_size); } out_unlock: mutex_unlock(&lo->lo_mutex); @@ -1440,7 +1411,7 @@ static int loop_set_capacity(struct loop_device *lo) if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - size = get_loop_size(lo, lo->lo_backing_file); + size = lo_calculate_size(lo, lo->lo_backing_file); loop_set_size(lo, size); return 0; @@ -1448,53 +1419,75 @@ static int loop_set_capacity(struct loop_device *lo) static int loop_set_dio(struct loop_device *lo, unsigned long arg) { - int error = -ENXIO; - if (lo->lo_state != Lo_bound) - goto out; + bool use_dio = !!arg; + unsigned int memflags; - __loop_update_dio(lo, !!arg); - if (lo->use_dio == !!arg) + if (lo->lo_state != Lo_bound) + return -ENXIO; + if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; - error = -EINVAL; - out: - return error; + + if (use_dio) { + if (!lo_can_use_dio(lo)) + return -EINVAL; + /* flush dirty pages before starting to use direct I/O */ + vfs_fsync(lo->lo_backing_file, 0); + } + + memflags = blk_mq_freeze_queue(lo->lo_queue); + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue, memflags); + return 0; } -static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, unsigned long arg) { + struct queue_limits lim; + unsigned int memflags; int err = 0; - if (lo->lo_state != Lo_bound) - return -ENXIO; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. + */ + if (!(mode & BLK_OPEN_EXCL)) { + err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL); + if (err) + return err; + } - err = blk_validate_block_size(arg); + err = mutex_lock_killable(&lo->lo_mutex); if (err) - return err; + goto abort_claim; + + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto unlock; + } if (lo->lo_queue->limits.logical_block_size == arg) - return 0; + goto unlock; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); - blk_mq_freeze_queue(lo->lo_queue); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, arg); - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - - blk_queue_logical_block_size(lo->lo_queue, arg); - blk_queue_physical_block_size(lo->lo_queue, arg); - blk_queue_io_min(lo->lo_queue, arg); + memflags = blk_mq_freeze_queue(lo->lo_queue); + err = queue_limits_commit_update(lo->lo_queue, &lim); loop_update_dio(lo); -out_unfreeze: - blk_mq_unfreeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue, memflags); +unlock: + mutex_unlock(&lo->lo_mutex); +abort_claim: + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_set_block_size); return err; } @@ -1513,9 +1506,6 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; - case LOOP_SET_BLOCK_SIZE: - err = loop_set_block_size(lo, arg); - break; default: err = -EINVAL; } @@ -1523,7 +1513,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, return err; } -static int lo_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; @@ -1558,24 +1548,25 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); - } break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); - } break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); + case LOOP_SET_BLOCK_SIZE: + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: - case LOOP_SET_BLOCK_SIZE: - if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: @@ -1593,6 +1584,7 @@ struct compat_loop_info { compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; + compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; @@ -1685,7 +1677,7 @@ loop_get_status_compat(struct loop_device *lo, return err; } -static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; @@ -1721,72 +1713,139 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) +static int lo_open(struct gendisk *disk, blk_mode_t mode) { - struct loop_device *lo = bdev->bd_disk->private_data; + struct loop_device *lo = disk->private_data; int err; err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; - if (lo->lo_state == Lo_deleting) + + if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); mutex_unlock(&lo->lo_mutex); return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + bool need_clear = false; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; + /* + * Clear the backing device information if this is the last close of + * a device that's been marked for auto clear, or on which LOOP_CLR_FD + * has been called. + */ - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - loop_schedule_rundown(lo); - return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); - } -out_unlock: + need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); + + if (need_clear) + __loop_clr_fd(lo); +} + +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + timer_shutdown_sync(&lo->timer); + mutex_destroy(&lo->lo_mutex); + kfree(lo); } static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, + .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* * And now the modules code and kernel interface. */ -static int max_loop; -module_param(max_loop, int, 0444); + +/* + * If max_loop is specified, create that many devices upfront. + * This also becomes a hard limit. If max_loop is not specified, + * the default isn't a hard limit (as before commit 85c50197716c + * changed the default value from 0 for max_loop=0 reasons), just + * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module + * init time. Loop devices can be requested on-demand with the + * /dev/loop-control interface, or be instantiated by accessing + * a 'dead' device node. + */ +static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; + +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD +static bool max_loop_specified; + +static int max_loop_param_set_int(const char *val, + const struct kernel_param *kp) +{ + int ret; + + ret = param_set_int(val, kp); + if (ret < 0) + return ret; + + max_loop_specified = true; + return 0; +} + +static const struct kernel_param_ops max_loop_param_ops = { + .set = max_loop_param_set_int, + .get = param_get_int, +}; + +module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); +#else +module_param(max_loop, int, 0444); +MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); +#endif + module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) +{ + int qd, ret; + + ret = kstrtoint(s, 0, &qd); + if (ret < 0) + return ret; + if (qd < 1) + return -EINVAL; + hw_queue_depth = qd; + return 0; +} + +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); + +MODULE_DESCRIPTION("Loopback device support"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1809,7 +1868,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->use_aio = false; break; default: - cmd->use_aio = lo->use_dio; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } @@ -1817,12 +1876,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (rq->bio && rq->bio->bi_blkg) { - cmd->blkcg_css = &bio_blkcg(rq->bio)->css; + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } #endif } #endif @@ -1833,6 +1894,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, static void loop_handle_cmd(struct loop_cmd *cmd) { + struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; + struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; @@ -1844,24 +1907,34 @@ static void loop_handle_cmd(struct loop_cmd *cmd) goto failed; } - if (cmd->blkcg_css) - kthread_associate_blkcg(cmd->blkcg_css); - if (cmd->memcg_css) + /* We can block in this context, so ignore REQ_NOWAIT. */ + if (rq->cmd_flags & REQ_NOWAIT) + rq->cmd_flags &= ~REQ_NOWAIT; + + if (cmd_blkcg_css) + kthread_associate_blkcg(cmd_blkcg_css); + if (cmd_memcg_css) old_memcg = set_active_memcg( - mem_cgroup_from_css(cmd->memcg_css)); + mem_cgroup_from_css(cmd_memcg_css)); + /* + * do_req_filebacked() may call blk_mq_complete_request() synchronously + * or asynchronously if using aio. Hence, do not touch 'cmd' after + * do_req_filebacked() has returned unless we are sure that 'cmd' has + * not yet been completed. + */ ret = do_req_filebacked(lo, rq); - if (cmd->blkcg_css) + if (cmd_blkcg_css) kthread_associate_blkcg(NULL); - if (cmd->memcg_css) { + if (cmd_memcg_css) { set_active_memcg(old_memcg); - css_put(cmd->memcg_css); + css_put(cmd_memcg_css); } failed: /* complete non-aio request */ - if (!cmd->use_aio || ret) { + if (ret != -EIOCBQUEUED) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else @@ -1871,11 +1944,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -1924,27 +1992,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, @@ -1952,6 +1999,12 @@ static const struct blk_mq_ops loop_mq_ops = { static int loop_add(int i) { + struct queue_limits lim = { + /* + * Random number picked from the historic block max_sectors cap. + */ + .max_hw_sectors = 2560u, + }; struct loop_device *lo; struct gendisk *disk; int err; @@ -1960,6 +2013,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); @@ -1981,34 +2037,23 @@ static int loop_add(int i) lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | - BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; - blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); - - /* - * By default, we do buffer IO, so it doesn't make sense to enable - * merge because the I/O submitted to backing file is handled page by - * page. For directio mode, merge does help to dispatch bigger request - * to underlayer disk. We will enable merge once directio is enabled. - */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its @@ -2028,12 +2073,13 @@ static int loop_add(int i) * userspace tools. Parameters like this in general should be avoided. */ if (!part_shift) - disk->flags |= GENHD_FL_NO_PART; - atomic_set(&lo->lo_refcnt, 0); + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; @@ -2056,7 +2102,7 @@ static int loop_add(int i) return i; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: @@ -2073,24 +2119,27 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); + mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); - /* There is no route which can find this loop device. */ - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + put_disk(lo->lo_disk); } +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static void loop_probe(dev_t dev) { int idx = MINOR(dev) >> part_shift; - if (max_loop && idx >= max_loop) + if (max_loop_specified && max_loop && idx >= max_loop) return; loop_add(idx); } +#else +#define loop_probe NULL +#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */ static int loop_control_remove(int idx) { @@ -2119,13 +2168,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); @@ -2194,7 +2242,7 @@ MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { - int i, nr; + int i; int err; part_shift = 0; @@ -2222,19 +2270,6 @@ static int __init loop_init(void) goto err_out; } - /* - * If max_loop is specified, create that many devices upfront. - * This also becomes a hard limit. If max_loop is not specified, - * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module - * init time. Loop devices can be requested on-demand with the - * /dev/loop-control interface, or be instantiated by accessing - * a 'dead' device node. - */ - if (max_loop) - nr = max_loop; - else - nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT; - err = misc_register(&loop_misc); if (err < 0) goto err_out; @@ -2246,7 +2281,7 @@ static int __init loop_init(void) } /* pre-create number of devices given by config or max_loop */ - for (i = 0; i < nr; i++) + for (i = 0; i < max_loop; i++) loop_add(i); printk(KERN_INFO "loop: module loaded\n"); @@ -2285,6 +2320,9 @@ module_exit(loop_exit); static int __init max_loop_setup(char *str) { max_loop = simple_strtol(str, NULL, 0); +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD + max_loop_specified = true; +#endif return 1; } |
