diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-08 11:11:51 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-08 11:11:51 -0800 |
commit | bb93c5ed457fe76597c14717eb994fc5aef22716 (patch) | |
tree | 99774d9b73c3b7900711d88ee1680d51f23f6b32 /fs/backing-file.c | |
parent | 8c9440fea77440772542d6dbcb5c36182495c164 (diff) | |
parent | c39e2ae3943d4ee278af4e1b1dcfd5946da1089b (diff) |
Merge tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs rw updates from Christian Brauner:
"This contains updates from Amir for read-write backing file helpers
for stacking filesystems such as overlayfs:
- Fanotify is currently in the process of introducing pre content
events. Roughly, a new permission event will be added indicating
that it is safe to write to the file being accessed. These events
are used by hierarchical storage managers to e.g., fill the content
of files on first access.
During that work we noticed that our current permission checking is
inconsistent in rw_verify_area() and remap_verify_area().
Especially in the splice code permission checking is done multiple
times. For example, one time for the whole range and then again for
partial ranges inside the iterator.
In addition, we mostly do permission checking before we call
file_start_write() except for a few places where we call it after.
For pre-content events we need such permission checking to be done
before file_start_write(). So this is a nice reason to clean this
all up.
After this series, all permission checking is done before
file_start_write().
As part of this cleanup we also massaged the splice code a bit. We
got rid of a few helpers because we are already drowning in special
read-write helpers. We also cleaned up the return types for splice
helpers.
- Introduce generic read-write helpers for backing files. This lifts
some overlayfs code to common code so it can be used by the FUSE
passthrough work coming in over the next cycles. Make Amir and
Miklos the maintainers for this new subsystem of the vfs"
* tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (30 commits)
fs: fix __sb_write_started() kerneldoc formatting
fs: factor out backing_file_mmap() helper
fs: factor out backing_file_splice_{read,write}() helpers
fs: factor out backing_file_{read,write}_iter() helpers
fs: prepare for stackable filesystems backing file helpers
fsnotify: optionally pass access range in file permission hooks
fsnotify: assert that file_start_write() is not held in permission hooks
fsnotify: split fsnotify_perm() into two hooks
fs: use splice_copy_file_range() inline helper
splice: return type ssize_t from all helpers
fs: use do_splice_direct() for nfsd/ksmbd server-side-copy
fs: move file_start_write() into direct_splice_actor()
fs: fork splice_file_range() from do_splice_direct()
fs: create {sb,file}_write_not_started() helpers
fs: create file_write_started() helper
fs: create __sb_write_started() helper
fs: move kiocb_start_write() into vfs_iocb_iter_write()
fs: move permission hook out of do_iter_read()
fs: move permission hook out of do_iter_write()
fs: move file_start_write() into vfs_iter_write()
...
Diffstat (limited to 'fs/backing-file.c')
-rw-r--r-- | fs/backing-file.c | 336 |
1 files changed, 336 insertions, 0 deletions
diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 000000000000..a681f38d84d8 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (C) 2023 CTERA Networks. + */ + +#include <linux/fs.h> +#include <linux/backing-file.h> +#include <linux/splice.h> +#include <linux/mm.h> + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_path: path that the user requested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + * + * Return: the opened file on success; on failure, the ERR_PTR() from + * alloc_empty_backing_file() or the vfs_open() error wrapped in ERR_PTR(). 
+ */ +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); + +/* + * State for one async request forwarded to a backing file: iocb is the + * clone submitted to the backing file and orig_iocb is the caller's kiocb. + */ +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +/* Keep only the kiocb flag bits listed in BACKING_IOCB_MASK. */ +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +/* + * Drop one reference. The final put releases the file reference held in + * iocb.ki_filp and frees the aio back to backing_aio_cachep. + */ +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +/* + * Call the end_write callback (if set) on the original file, copy the + * file position back to the original kiocb and drop one aio reference. + * NOTE(review): @res is not used in this function. + */ +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +/* + * Completion handler for the cloned kiocb: end the write (for IOCB_WRITE), + * clean up the aio, then invoke the original kiocb's ki_complete(). + */ +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +/* Work item body: run the deferred completion with the saved result. */ +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +/* Write completion handler: save @res and defer completion to s_dio_done_wq. */ +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio 
= container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +/* Set up sb->s_dio_done_wq for @iocb's superblock unless it already exists. */ +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +/** + * backing_file_read_iter - read from a backing file + * @file: backing file to read from; must have FMODE_BACKING set + * @iter: destination iterator + * @iocb: kiocb of the caller; supplies the position and sync/async mode + * @flags: kiocb flags; used (masked by BACKING_IOCB_MASK) only on the + * synchronous path + * @ctx: supplies the credentials to read with and the optional accessed() + * callback, invoked on ctx->user_file after the credentials are reverted + * + * Return: the result of the underlying read, 0 for an empty @iter, -EIO if + * @file is not a backing file, -EINVAL for IOCB_DIRECT on a file without + * FMODE_CAN_ODIRECT, -ENOMEM if the async request state cannot be allocated. + */ +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + /* + * Two references: one is dropped right after submission, the + * other in backing_aio_cleanup(), which runs below unless the + * request was queued (-EIOCBQUEUED), in which case it is left + * to backing_aio_rw_complete(). + */ + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +/** + * backing_file_write_iter - write to a backing file + * @file: backing file to write to; must have FMODE_BACKING set + * @iter: source iterator + * @iocb: kiocb of the caller; supplies the position and sync/async mode + * @flags: kiocb flags; IOCB_DIO_CALLER_COMP is always cleared + * @ctx: supplies the credentials to write with, the user file passed to + * file_remove_privs(), and the optional end_write() callback + * + * Return: the result of the underlying write, 0 for an empty @iter, or a + * negative error from the checks and setup done before the write. + */ +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + 
return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + /* completion is deferred to the s_dio_done_wq work queue */ + aio->iocb.ki_complete = backing_aio_queue_completion; + /* one ref dropped after submission, one in backing_aio_cleanup() */ + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +/* + * Splice from backing file @in into @pipe using ctx->cred, then call the + * optional ctx->accessed() callback on ctx->user_file. + * Returns -EIO if @in does not have FMODE_BACKING set. + */ +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +/* + * Splice from @pipe into backing file @out using ctx->cred, inside a + * file_start_write()/file_end_write() section, then call the optional + * ctx->end_write() callback on ctx->user_file. + * Returns -EIO if @out does not have FMODE_BACKING set, or the error from + * file_remove_privs() on ctx->user_file. + */ +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = 
file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + +/* + * Map backing file @file in place of ctx->user_file: @vma is switched to + * @file with vma_set_file() and the mmap call is made with ctx->cred, + * followed by the optional ctx->accessed() callback on ctx->user_file. + * Returns -EIO if @file lacks FMODE_BACKING or @vma does not map + * ctx->user_file, -ENODEV if @file has no mmap operation. + */ +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + +/* Create the slab cache that backs struct backing_aio allocations. */ +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); |