From db3db63b1d17c98f69e894edaa2b0b364ecde7a9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 4 Nov 2023 23:11:17 +0100 Subject: vfs: remove a redundant might_sleep in wait_on_inode wait_on_bit already does it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20231104221117.2584708-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/writeback.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c..6d0a14f7019d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { - might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } -- cgit From 600f111ef51dc2cbdb330b09d09f1856efa64912 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 21:58:23 +0000 Subject: fs: Rename mapping private members It is hard to find where mapping->private_lock, mapping->private_list and mapping->private_data are used, due to private_XXX being a relatively common name for variables and structure members in the kernel. To fit with other members of struct address_space, rename them all to have an i_ prefix. Tested with an allmodconfig build. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org Acked-by: Darrick J. Wong Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..f171505940ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -484,9 +484,9 @@ struct address_space { unsigned long flags; struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; - void *private_data; + spinlock_t i_private_lock; + struct list_head i_private_list; + void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but -- cgit From 3652117f854819a148ff0fbe4492587d3520b5e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:23 +0100 Subject: eventfd: simplify eventfd_signal() Ever since the eventfd type was introduced back in 2007 in commit e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal() function only ever passed 1 as a value for @n. There's no point in keeping that additional argument. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-2-bd549b14ce0c@kernel.org Acked-by: Xu Yilun Acked-by: Andrew Donnellan # ocxl Acked-by: Eric Farman # s390 Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index b9d83652c097..562089431551 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,7 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); +__u64 eventfd_signal(struct eventfd_ctx *ctx); __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); @@ -58,7 +58,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +static inline int eventfd_signal(struct eventfd_ctx *ctx) { return -ENOSYS; } -- cgit From 120ae58593630819209a011a3f9c89f73bcc9894 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:24 +0100 Subject: eventfd: simplify eventfd_signal_mask() The eventfd_signal_mask() helper was introduced for io_uring and similar to eventfd_signal() it always passed 1 for @n. So don't bother with that argument at all. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-3-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 562089431551..971943ecb2a6 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx) return -ENOSYS; } -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, - unsigned mask) +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { return -ENOSYS; } -- cgit From b7638ad0c7802ea854599ce753d0e6d20690f7e2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:25 +0100 Subject: eventfd: make eventfd_signal{_mask}() void No caller care about the return value. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-4-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 971943ecb2a6..e32bee4345fb 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx) +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { - return -ENOSYS; -} - -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) -{ - return -ENOSYS; } static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) @@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) #endif +static inline void eventfd_signal(struct eventfd_ctx *ctx) +{ + eventfd_signal_mask(ctx, 0); +} + #endif /* _LINUX_EVENTFD_H */ -- cgit From e65a29f0235a438ece414d2d99bbf0d31aa97d04 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:37 +0100 Subject: mnt_idmapping: remove check_fsmapping() The helper is a bit pointless. Just open-code the check. Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-1-dae4abdde5bd@kernel.org Signed-off-by: Christian Brauner --- include/linux/mnt_idmapping.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b8da2db4ecd2..cd4d5c8781f5 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb); - #endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:39 +0100 Subject: mnt_idmapping: decouple from namespaces There's no reason we need to couple mnt idmapping to namespaces in the way we currently do. Copy the idmapping when an idmapped mount is created and don't take any reference on the namespace at all. We also can't easily refcount struct uid_gid_map because it needs to stay the size of a cacheline otherwise we risk performance regressions (Ignoring for a second that right now struct uid_gid_map isn't actually 64 byte but 72 but that's a fix for another patch series.). Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/uidgid.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb..415a7ca2b882 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -17,6 +17,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct uid_gid_map; typedef struct { uid_t val; @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ -- cgit From 253ca8678d30bcf94410b54476fc1e0f1627a137 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Nov 2023 12:24:38 -0800 Subject: Improve __fget_files_rcu() code generation (and thus __fget_light()) Commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") caused a performance regression as reported by the kernel test robot. The __fget_light() function is one of those critical ones for some loads, and the code generation was unnecessarily impacted. Let's just write that function to better. Reported-by: kernel test robot Cc: Christian Brauner Cc: Jann Horn Cc: Mateusz Guzik Closes: https://lore.kernel.org/oe-lkp/202311201406.2022ca3f-oliver.sang@intel.com Signed-off-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiCJtLbFWNURB34b9a_R_unaH3CiMRXfkR0-iihB_z68A@mail.gmail.com Signed-off-by: Christian Brauner --- include/linux/fdtable.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index bc4c3287a65e..80bd7789bab1 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -83,12 +83,17 @@ struct dentry; static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) { - fd = array_index_nospec(fd, fdt->max_fds); - return rcu_dereference_raw(fdt->fd[fd]); - } - return NULL; + unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); + struct file *needs_masking; + + /* + * 'mask' is zero for an out-of-bounds fd, all ones for ok. + * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. + * + * Accessing fdt->fd[0] is ok, but needs masking of the result. + */ + needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); + return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) -- cgit From a88c955fcfb49727d0ed86b47410f6555a8e69e4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:07 +0100 Subject: file: s/close_fd_get_file()/file_close_fd()/g That really shouldn't have "get" in there as that implies we're bumping the reference count which we don't do at all. We used to but not anmore. Now we're just closing the fd and pick that file from the fdtable without bumping the reference count. Update the wrong documentation while at it. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-1-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fdtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 80bd7789bab1..78c8326d74ae 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -119,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern struct file *close_fd_get_file(unsigned int fd); +extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); -- cgit From 372a34e66fb7f95124fadae9c600b231c35696a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:09 +0100 Subject: fs: replace f_rcuhead with f_task_work The naming is actively misleading since we switched to SLAB_TYPESAFE_BY_RCU. rcu_head is #define callback_head. Use callback_head directly and rename f_rcuhead to f_task_work. Add comments in there to explain what it's used for. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-3-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..354fd02e0e11 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) */ struct file { union { + /* fput() uses task work when closing and freeing file (default). */ + struct callback_head f_task_work; + /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; -- cgit From eac9189c96196574a83a553ca5a7543dd9f5fe3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:10 +0100 Subject: file: stop exposing receive_fd_user() Not every subsystem needs to have their own specialized helper. Just us the __receive_fd() helper. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-4-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/file.h | 7 ------- include/net/scm.h | 9 +++++++++ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index 6e9099d29343..c0d5219c2852 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -101,13 +101,6 @@ extern int __receive_fd(struct file *file, int __user *ufd, extern int receive_fd(struct file *file, unsigned int o_flags); -static inline int receive_fd_user(struct file *file, int __user *ufd, - unsigned int o_flags) -{ - if (ufd == NULL) - return -EFAULT; - return __receive_fd(file, ufd, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); diff --git a/include/net/scm.h b/include/net/scm.h index e8c76b4be2fe..8aae2468bae0 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -208,5 +209,13 @@ static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, scm_destroy_cred(scm); } +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) +{ + if (!ufd) + return -EFAULT; + return __receive_fd(f, ufd, flags); +} + #endif /* __LINUX_NET_SCM_H */ -- cgit From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:11 +0100 Subject: file: remove __receive_fd() Honestly, there's little value in having a helper with and without that int __user *ufd argument. It's just messy and doesn't really give us anything. Just expose receive_fd() with that argument and get rid of that helper. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/file.h | 5 +---- include/net/scm.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index c0d5219c2852..6834a29338c4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); - -extern int receive_fd(struct file *file, unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); diff --git a/include/net/scm.h b/include/net/scm.h index 8aae2468bae0..cf68acec4d70 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -214,7 +214,7 @@ static inline int scm_recv_one_fd(struct file *f, int __user *ufd, { if (!ufd) return -EFAULT; - return __receive_fd(f, ufd, flags); + return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */ -- cgit From 3efdc78fdc21ab82694707eb234ab93f28d13ba8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 13 Dec 2023 22:44:38 -0800 Subject: fs/proc: show correct device and inode numbers in /proc/pid/maps /proc/pid/maps shows device and inode numbers of vma->vm_file-s. Here is an issue. If a mapped file is on a stackable file system (e.g., overlayfs), vma->vm_file is a backing file whose f_inode is on the underlying filesystem. To show correct numbers, we need to get a user file and shows its numbers. The same trick is used to show file paths in /proc/pid/maps. Cc: Alexander Mikhalitsyn Suggested-by: Amir Goldstein Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20231214064439.1023011-1-avagin@google.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/fs.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index f171505940ff..a3a48a5d8728 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2523,20 +2523,28 @@ struct file *backing_file_open(const struct path *user_path, int flags, struct path *backing_file_user_path(struct file *f); /* - * file_user_path - get the path to display for memory mapped file - * * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying - * filesystem. When the mapped file path is displayed to user (e.g. via - * /proc//maps), this helper should be used to get the path to display - * to the user, which is the path of the fd that user has requested to map. + * filesystem. When the mapped file path and inode number are displayed to + * user (e.g. via /proc//maps), these helpers should be used to get the + * path and inode number to display to the user, which is the path of the fd + * that user has requested to map and the inode number that would be returned + * by fstat() on that same fd. */ +/* Get the path to display in /proc//maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } +/* Get the inode whose inode number to display in /proc//maps */ +static inline const struct inode *file_user_inode(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return d_inode(backing_file_user_path(f)->dentry); + return file_inode(f); +} static inline struct file *file_clone_open(struct file *file) { -- cgit