Diffstat (limited to 'fs/proc')
-rw-r--r--  fs/proc/Kconfig        |   21
-rw-r--r--  fs/proc/Makefile       |    2
-rw-r--r--  fs/proc/array.c        |   92
-rw-r--r--  fs/proc/base.c         |  377
-rw-r--r--  fs/proc/bootconfig.c   |    6
-rw-r--r--  fs/proc/cmdline.c      |    1
-rw-r--r--  fs/proc/consoles.c     |    7
-rw-r--r--  fs/proc/fd.c           |  104
-rw-r--r--  fs/proc/fd.h           |    2
-rw-r--r--  fs/proc/generic.c      |   78
-rw-r--r--  fs/proc/inode.c        |   94
-rw-r--r--  fs/proc/internal.h     |  132
-rw-r--r--  fs/proc/interrupts.c   |    4
-rw-r--r--  fs/proc/kcore.c        |  224
-rw-r--r--  fs/proc/meminfo.c      |   22
-rw-r--r--  fs/proc/namespaces.c   |   11
-rw-r--r--  fs/proc/nommu.c        |    2
-rw-r--r--  fs/proc/page.c         |  286
-rw-r--r--  fs/proc/proc_net.c     |    6
-rw-r--r--  fs/proc/proc_sysctl.c  |  707
-rw-r--r--  fs/proc/root.c         |  129
-rw-r--r--  fs/proc/self.c         |   12
-rw-r--r--  fs/proc/softirqs.c     |    2
-rw-r--r--  fs/proc/stat.c         |   30
-rw-r--r--  fs/proc/task_mmu.c     | 2073
-rw-r--r--  fs/proc/task_nommu.c   |  101
-rw-r--r--  fs/proc/thread_self.c  |   13
-rw-r--r--  fs/proc/vmcore.c       |  386
28 files changed, 3253 insertions, 1671 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137..6ae966c561e7 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -32,7 +32,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU - select CRASH_CORE + select VMCORE_INFO help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be @@ -61,6 +61,25 @@ config PROC_VMCORE_DEVICE_DUMP as ELF notes to /proc/vmcore. You can still disable device dump using the kernel command line option 'novmcoredd'. +config NEED_PROC_VMCORE_DEVICE_RAM + bool + +config PROC_VMCORE_DEVICE_RAM + def_bool y + depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM + depends on VIRTIO_MEM + help + If the elfcore hdr is allocated and prepared by the dump kernel + ("2nd kernel") instead of the crashed kernel, RAM provided by memory + devices such as virtio-mem will not be included in the dump + image, because only the device driver can properly detect them. + + With this config enabled, these RAM ranges will be queried from the + device drivers once the device gets probed, so they can be included + in the crash dump. + + Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM. + config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..7b4db9c56e6a 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o -CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,) +CFLAGS_task_mmu.o += -Wno-override-init proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := task_mmu.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 49283b8103c7..42932f88141a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,6 +91,7 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" @@ -108,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) else if (p->flags & PF_KTHREAD) get_kthread_comm(tcomm, sizeof(tcomm), p); else - __get_task_comm(tcomm, sizeof(tcomm), p); + get_task_comm(tcomm, p); if (escape) seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); @@ -156,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, unsigned int max_fds = 0; rcu_read_lock(); - ppid = pid_alive(p) ? - task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); + ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); @@ -219,6 +218,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); + + seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? 
'1' : '0'); } void render_sigset_t(struct seq_file *m, const char *header, @@ -300,13 +301,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) static void render_cap_t(struct seq_file *m, const char *header, kernel_cap_t *a) { - unsigned __capi; - seq_puts(m, header); - CAP_FOR_EACH_U32(__capi) { - seq_put_hex_ll(m, NULL, - a->cap[CAP_LAST_U32 - __capi], 8); - } + seq_put_hex_ll(m, NULL, a->val, 16); seq_putc(m, '\n'); } @@ -424,10 +420,20 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } +static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); +} + +__weak void arch_proc_pid_thread_features(struct seq_file *m, + struct task_struct *task) +{ +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -443,6 +449,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); + task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); @@ -451,6 +458,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); + arch_proc_pid_thread_features(m, task); return 0; } @@ -467,13 +475,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; state = *get_task_state(task); vsize = eip = esp = 0; @@ -490,7 +497,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. 
*/ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); @@ -501,12 +508,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -517,27 +520,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t = task; - do { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } while_each_thread(task, t); - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } @@ -551,10 +536,37 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + scoped_guard(rcu) { + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + } + } + } + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { + task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..4eec684baca9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> @@ -85,6 +84,7 @@ #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> @@ -96,6 +96,8 @@ #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> +#include <linux/ksm.h> +#include <uapi/linux/lsm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -115,6 +117,40 @@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; +enum proc_mem_force { + PROC_MEM_FORCE_ALWAYS, + PROC_MEM_FORCE_PTRACE, + PROC_MEM_FORCE_NEVER +}; + +static enum proc_mem_force proc_mem_force_override __ro_after_init = + IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : + IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? 
PROC_MEM_FORCE_PTRACE : + PROC_MEM_FORCE_ALWAYS; + +static const struct constant_table proc_mem_force_table[] __initconst = { + { "always", PROC_MEM_FORCE_ALWAYS }, + { "ptrace", PROC_MEM_FORCE_PTRACE }, + { "never", PROC_MEM_FORCE_NEVER }, + { } +}; + +static int __init early_proc_mem_force_override(char *buf) +{ + if (!buf) + return -EINVAL; + + /* + * lookup_constant() defaults to proc_mem_force_override to preseve + * the initial Kconfig choice in case an invalid param gets passed. + */ + proc_mem_force_override = lookup_constant(proc_mem_force_table, + buf, proc_mem_force_override); + + return 0; +} +early_param("proc_mem.force_override", early_proc_mem_force_override); + struct pid_entry { const char *name; unsigned int len; @@ -145,10 +181,10 @@ struct pid_entry { NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) -#define ATTR(LSM, NAME, MODE) \ +#define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ - { .lsm = LSM }) + { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -380,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = { #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. - * Returns the resolved symbol. If that fails, simply return the address. + * Returns the resolved symbol to user space. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) @@ -685,7 +721,7 @@ static bool proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; @@ -694,12 +730,11 @@ int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); - mark_inode_dirty(inode); + setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } @@ -727,7 +762,7 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct user_namespace *mnt_userns, +static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); @@ -753,7 +788,7 @@ static int proc_pid_permission(struct user_namespace *mnt_userns, return -EPERM; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } @@ -792,23 +827,31 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - +/* + * proc_mem_open() can return errno, NULL or mm_struct*. 
+ * + * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) + * - Returns mm_struct* on success + * - Returns error code on failure + */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); - struct mm_struct *mm = ERR_PTR(-ESRCH); + struct mm_struct *mm; - if (task) { - mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); - put_task_struct(task); + if (!task) + return ERR_PTR(-ESRCH); - if (!IS_ERR_OR_NULL(mm)) { - /* ensure this mm_struct can't be freed */ - mmgrab(mm); - /* but do not pin its memory */ - mmput(mm); - } - } + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); + put_task_struct(task); + + if (IS_ERR(mm)) + return mm == ERR_PTR(-ESRCH) ? NULL : mm; + + /* ensure this mm_struct can't be freed */ + mmgrab(mm); + /* but do not pin its memory */ + mmput(mm); return mm; } @@ -817,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; @@ -826,12 +869,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; + if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; + return __mem_open(inode, file, PTRACE_MODE_ATTACH); +} - return ret; +static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) +{ + struct task_struct *task; + bool ptrace_active = false; + + switch (proc_mem_force_override) { + case PROC_MEM_FORCE_NEVER: + return false; + case PROC_MEM_FORCE_PTRACE: + task = get_proc_task(file_inode(file)); + if (task) { + ptrace_active = READ_ONCE(task->ptrace) && + READ_ONCE(task->mm) == mm && + READ_ONCE(task->parent) == current; + put_task_struct(task); + } + return ptrace_active; + default: + return true; + } } static ssize_t mem_rw(struct file *file, char __user *buf, @@ -854,7 +916,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); + flags = write ? 
FOLL_WRITE : 0; + if (proc_mem_foll_force(file, mm)) + flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); @@ -931,6 +995,7 @@ static const struct file_operations proc_mem_operations = { .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) @@ -1098,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { + if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } @@ -1153,11 +1218,10 @@ err_unlock: static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1213,11 +1277,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1358,13 +1421,13 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1432,7 +1495,6 @@ static const struct file_operations proc_fail_nth_operations = { #endif -#ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ @@ -1482,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = { .release = single_release, }; -#endif - #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -1509,11 +1569,10 @@ sched_autogroup_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int nice; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1666,10 +1725,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[TASK_COMM_LEN]; + char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; - memset(buffer, 0, sizeof(buffer)); if (copy_from_user(buffer, buf, count > maxlen ? 
maxlen : count)) return -EFAULT; @@ -1881,8 +1939,6 @@ void proc_pid_evict_inode(struct proc_inode *ei) hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } - - put_pid(pid); } struct inode *proc_pid_make_inode(struct super_block *sb, @@ -1902,7 +1958,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb, ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* @@ -1959,14 +2015,14 @@ static struct inode *proc_pid_make_base_inode(struct super_block *sb, return inode; } -int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, +int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -2005,7 +2061,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode) * performed a setuid(), etc. * */ -static int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -2070,7 +2127,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, unsigned type = DT_UNKNOWN; ino_t ino = 1; - child = d_hash_and_lookup(dir, &qname); + child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); @@ -2138,7 +2195,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; @@ -2156,7 +2214,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) + if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { @@ -2218,7 +2276,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } @@ -2281,8 +2339,8 @@ proc_map_files_instantiate(struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - d_set_d_op(dentry, &tid_map_files_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -2442,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid *pid; + struct task_struct *task; + struct pid_namespace *ns; }; static void 
*timers_start(struct seq_file *m, loff_t *pos) @@ -2457,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_list_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_list_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = list_entry((struct list_head *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%px\n", - timer->sigq->info.si_signo, - timer->sigq->info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, + timer->sigq.info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -2574,10 +2624,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, } task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; + if (rt_or_dl_task_policy(p)) + slack_ns = 0; + else if (slack_ns == 0) + slack_ns = p->default_timer_slack_ns; + p->timer_slack_ns = slack_ns; task_unlock(p); out: @@ -2653,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2730,7 +2780,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, PROC_I(inode)->op.lsm, + length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); @@ -2788,7 +2838,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; - rv = security_setprocattr(PROC_I(inode)->op.lsm, + rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); @@ -2817,7 +2867,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ - .iterate = proc_##LSM##_attr_dir_iterate, \ + .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ @@ -2837,27 +2887,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { - ATTR("smack", "current", 0666), + ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { - ATTR("apparmor", "current", 0666), - ATTR("apparmor", "prev", 0444), - ATTR("apparmor", "exec", 0666), + ATTR(LSM_ID_APPARMOR, "current", 0666), + ATTR(LSM_ID_APPARMOR, "prev", 0444), + ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { - ATTR(NULL, "current", 0666), - ATTR(NULL, "prev", 0444), - ATTR(NULL, "exec", 0666), - ATTR(NULL, "fscreate", 0666), - ATTR(NULL, "keycreate", 0666), - ATTR(NULL, "sockcreate", 0666), + ATTR(LSM_ID_UNDEF, "current", 0666), + ATTR(LSM_ID_UNDEF, "prev", 0444), + ATTR(LSM_ID_UNDEF, "exec", 0666), + ATTR(LSM_ID_UNDEF, "fscreate", 0666), + ATTR(LSM_ID_UNDEF, "keycreate", 0666), + ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), @@ -2912,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -2952,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, 
mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); @@ -2976,8 +3028,7 @@ static const struct file_operations proc_coredump_filter_operations = { #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { - struct task_io_accounting acct = task->ioac; - unsigned long flags; + struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); @@ -2989,15 +3040,21 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh goto out_unlock; } - if (whole && lock_task_sighand(task, &flags)) { - struct task_struct *t = task; + if (whole) { + struct signal_struct *sig = task->signal; + struct task_struct *t; - task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) - task_io_accounting_add(&acct, &t->ioac); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { + acct = sig->ioac; + __for_each_thread(sig, t) + task_io_accounting_add(&acct, &t->ioac); - unlock_task_sighand(task, &flags); + } + } else { + acct = task->ioac; } + seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" @@ -3203,10 +3260,24 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; + int ret = 0; mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); + seq_printf(m, "ksm_merge_any: %s\n", + mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + return ret; + } + seq_printf(m, "ksm_mergeable: %s\n", + ksm_process_mergeable(mm) ? 
"yes" : "no"); + mmap_read_unlock(mm); mmput(mm); } @@ -3214,7 +3285,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_KSM */ -#ifdef CONFIG_STACKLEAK_METRICS +#ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3227,7 +3298,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, prev_depth, depth); return 0; } -#endif /* CONFIG_STACKLEAK_METRICS */ +#endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups @@ -3249,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif @@ -3336,7 +3405,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif -#ifdef CONFIG_STACKLEAK_METRICS +#ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS @@ -3426,8 +3495,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) @@ -3510,14 +3578,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(fs_info->proc_self); - if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(fs_info->proc_thread_self); - if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } @@ -3557,7 +3623,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. 
*/ -static int proc_tid_comm_permission(struct user_namespace *mnt_userns, +static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; @@ -3577,11 +3643,12 @@ static int proc_tid_comm_permission(struct user_namespace *mnt_userns, return 0; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { - .permission = proc_tid_comm_permission, + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, }; /* @@ -3599,9 +3666,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), @@ -3730,8 +3795,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, set_nlink(inode, nlink_tid); pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) @@ -3811,11 +3875,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - pos = task = task->group_leader; - do { + for_each_thread(task, pos) { if (!nr--) goto found; - } while_each_thread(task, pos); + } fail: pos = NULL; goto out; @@ -3837,10 +3900,8 @@ static struct task_struct *next_tid(struct task_struct *start) struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { - pos = next_thread(start); - if (thread_group_leader(pos)) - pos = NULL; - else + pos = __next_thread(start); + if (pos) get_task_struct(pos); } rcu_read_unlock(); @@ -3862,12 +3923,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); - tid = (int)file->f_version; - file->f_version = 0; + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { @@ -3877,12 +3938,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. 
*/ - len = snprintf(name, sizeof(name), "%u", tid); + len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@ -3891,13 +3952,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(struct user_namespace *mnt_userns, +static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); @@ -3907,6 +3968,24 @@ static int proc_task_getattr(struct user_namespace *mnt_userns, return 0; } +/* + * proc_task_readdir() set @file->private_data to a positive integer + * value, so casting that to u64 is safe. generic_llseek_cookie() will + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is + * here to catch any unexpected change in behavior either in + * proc_task_readdir() or generic_llseek_cookie(). + */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@ -3917,7 +3996,7 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 2e244ada1f97..87dcaae32ff8 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -63,6 +63,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) dst += ret; } } + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; + } out: kfree(key); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 91fe1597af7b..a6f76121955f 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -17,6 +17,7 @@ static int __init proc_cmdline_init(void) struct proc_dir_entry *pde; pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde_make_permanent(pde); pde->size = saved_command_line_len + 1; return 0; } diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index e0758fe7936d..b7cab1ad990d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, @@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v) seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read 
? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + ((con->flags & CON_NBCON) || con->write) ? 'W' : '-', + con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); @@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v) } static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(&console_mutex) { struct console *con; loff_t off = 0; @@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) } static void c_stop(struct seq_file *m, void *v) + __releases(&console_mutex) { console_list_unlock(); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index fc46d6fe080c..9eeccff49b2a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> @@ -38,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v) spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, fd); if (file) { - struct fdtable *fdt = files_fdtable(files); - f_flags = file->f_flags; - if (close_on_exec(fd, fdt)) + if (close_on_exec(fd, files)) f_flags |= O_CLOEXEC; get_file(file); @@ -60,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v) real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); - /* show_fd_locks() never deferences files so a stale value is safe */ + /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; @@ -73,7 +72,18 @@ out: return 0; } -static int proc_fdinfo_access_allowed(struct inode *inode) +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_show, inode); +} + +/* + * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure + * that the current task has PTRACE_MODE_READ in addition to the normal + * POSIX-like checks. 
+ */ +static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -87,18 +97,13 @@ static int proc_fdinfo_access_allowed(struct inode *inode) if (!allowed) return -EACCES; - return 0; + return generic_permission(idmap, inode, mask); } -static int seq_fdinfo_open(struct inode *inode, struct file *file) -{ - int ret = proc_fdinfo_access_allowed(inode); - - if (ret) - return ret; - - return single_open(file, seq_show, inode); -} +static const struct inode_operations proc_fdinfo_file_inode_operations = { + .permission = proc_fdinfo_permission, + .setattr = proc_setattr, +}; static const struct file_operations proc_fdinfo_file_operations = { .open = seq_fdinfo_open, @@ -111,11 +116,11 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; - rcu_read_lock(); - file = task_lookup_fd_rcu(task, fd); - if (file) + file = fget_task(task, fd); + if (file) { *mode = file->f_mode; - rcu_read_unlock(); + fput(file); + } return !!file; } @@ -135,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, security_task_to_inode(task, inode); } -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct task_struct *task; struct inode *inode; @@ -213,8 +219,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry, ei->op.proc_get_link = proc_fd_link; tid_fd_update_inode(task, inode, data->mode); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -251,30 +257,27 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - rcu_read_lock(); for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = task_lookup_next_fd_rcu(p, &fd); + f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; - rcu_read_unlock(); + fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out; + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); out: put_task_struct(p); return 0; @@ -304,14 +307,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count) return 0; } -static int proc_readfd(struct file *file, struct dir_context *ctx) +static int proc_fd_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfd, + .iterate_shared = proc_fd_iterate, .llseek = generic_file_llseek, }; @@ -325,13 +328,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ -int proc_fd_permission(struct user_namespace *mnt_userns, +int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(&init_user_ns, inode, mask); + rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; @@ -344,23 +347,14 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } -static int proc_fd_getattr(struct user_namespace *mnt_userns, +static int proc_fd_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int rv = 0; - - generic_fillattr(&init_user_ns, inode, stat); - - /* If it's a directory, put the number of open fds there */ - if (S_ISDIR(inode->i_mode)) { - rv = proc_readfd_count(inode, &stat->size); - if (rv < 0) - return rv; - } - return rv; + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return proc_readfd_count(inode, &stat->size); } const struct inode_operations proc_fd_inode_operations = { @@ -384,11 +378,13 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, ei = PROC_I(inode); ei->fd = data->fd; + inode->i_op = &proc_fdinfo_file_inode_operations; + inode->i_fop = &proc_fdinfo_file_operations; tid_fd_update_inode(task, inode, 0); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry * @@ -397,30 +393,20 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); } -static int proc_open_fdinfo(struct inode *inode, struct file *file) -{ - int ret = proc_fdinfo_access_allowed(inode); - - if (ret) - return ret; - - return 0; -} - const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, + .permission = proc_fdinfo_permission, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { - .open = proc_open_fdinfo, .read = generic_read_dir, - .iterate_shared = proc_readfdinfo, + .iterate_shared = proc_fdinfo_iterate, .llseek = generic_file_llseek, }; diff --git a/fs/proc/fd.h b/fs/proc/fd.h index c5a921a06a0b..7e7265f7e06f 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,7 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct user_namespace *mnt_userns, +extern int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 587b91d9d998..501889856461 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,26 +115,25 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct user_namespace *mnt_userns, +static int proc_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); 
+ error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - setattr_copy(&init_user_ns, inode, iattr); - mark_inode_dirty(inode); + setattr_copy(&nop_mnt_idmap, inode, iattr); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } -static int proc_getattr(struct user_namespace *mnt_userns, +static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -147,7 +146,7 @@ static int proc_getattr(struct user_namespace *mnt_userns, } } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -203,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum) { int i; - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, - GFP_KERNEL); + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, + GFP_KERNEL); if (i < 0) return i; @@ -214,10 +213,11 @@ int proc_alloc_inum(unsigned int *inum) void proc_free_inum(unsigned int inum) { - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } -static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -254,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, de->proc_dops); - return d_splice_alias(inode, dentry); + if (de->flags & PROC_ENTRY_FORCE_LOOKUP) + return d_splice_alias_ops(inode, dentry, + &proc_net_dentry_ops); + return d_splice_alias_ops(inode, dentry, + &proc_misc_dentry_ops); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -344,7 +347,8 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 0; } @@ -363,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; +static void pde_set_flags(struct proc_dir_entry *pde) +{ + const struct proc_ops *proc_ops = pde->proc_ops; + + if (!proc_ops) + return; + + if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; + if (proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif + if (proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; +} + /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) @@ -370,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { @@ -447,9 +473,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); - ent->proc_dops = &proc_misc_dentry_ops; /* Revalidate everything under 
/proc/${pid}/net */ - if ((*parent)->proc_dops == &proc_net_dentry_ops) + if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP) pde_force_lookup(ent); out: @@ -465,9 +490,9 @@ struct proc_dir_entry *proc_symlink(const char *name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { - ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + ent->size = strlen(dest); + ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { - strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; ent = proc_register(parent, ent); } else { @@ -558,12 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static inline void pde_set_flags(struct proc_dir_entry *pde) -{ - if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) - pde->flags |= PROC_ENTRY_PERMANENT; -} - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -574,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -680,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde) } } +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) +{ + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); +} + /* * Remove a /proc entry and free it if it's not currently in use. */ @@ -702,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { - rb_erase(&de->subdir_node, &parent->subdir); + pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } @@ -746,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) root->parent->name, root->name); return -EINVAL; } - rb_erase(&root->subdir_node, &parent->subdir); + pde_erase(root, parent); de = root; while (1) { @@ -758,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) next->parent->name, next->name); return -EINVAL; } - rb_erase(&next->subdir_node, &de->subdir); + pde_erase(next, de); de = next; continue; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f495fdb39151..b7634f975d98 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -30,7 +30,6 @@ static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); @@ -38,21 +37,12 @@ static void proc_evict_inode(struct inode *inode) clear_inode(inode); /* Stop tracking associated processes */ - if (ei->pid) { + if (ei->pid) proc_pid_evict_inode(ei); - ei->pid = NULL; - } - - /* Let go of any associated proc directory entry */ - de = ei->pde; - if (de) { - pde_put(de); - ei->pde = NULL; - } head = ei->sysctl; if (head) { - RCU_INIT_POINTER(ei->sysctl, NULL); + WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } @@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_free_inode(struct inode *inode) { + struct proc_inode *ei = PROC_I(inode); + + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } @@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, 
(SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = @@ -110,18 +107,15 @@ void __init proc_init_kmemcache(void) void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { - struct inode *inode; - struct proc_inode *ei; struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); - for (;;) { + while ((node = hlist_first_rcu(inodes))) { + struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; - node = hlist_first_rcu(inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sibling_inodes); + struct inode *inode; + spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); @@ -193,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, @@ -309,9 +303,7 @@ static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { - typeof_member(struct proc_ops, proc_read) read; - - read = pde->proc_ops->proc_read; + const auto read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; @@ -333,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - typeof_member(struct proc_ops, proc_write) write; - - write = pde->proc_ops->proc_write; + const auto write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; @@ -357,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { - typeof_member(struct proc_ops, proc_poll) poll; - - poll = pde->proc_ops->proc_poll; + const auto poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; @@ -381,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - - ioctl = pde->proc_ops->proc_ioctl; + const auto ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; @@ -406,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; + const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; @@ -430,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { - typeof_member(struct proc_ops, proc_mmap) mmap; - - mmap = pde->proc_ops->proc_mmap; + const auto 
mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; @@ -457,15 +439,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo unsigned long len, unsigned long pgoff, unsigned long flags) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + if (pde->proc_ops->proc_get_unmapped_area) + return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); - get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif - if (get_area) - return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; } @@ -491,10 +471,9 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; - typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; - if (!pde->proc_ops->proc_lseek) + if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { @@ -518,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!use_pde(pde)) return -ENOENT; - release = pde->proc_ops->proc_release; + const auto release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { @@ -555,12 +534,9 @@ static int proc_reg_release(struct inode *inode, struct file *file) struct pde_opener *pdeo; if (pde_is_permanent(pde)) { - typeof_member(struct proc_ops, proc_release) release; - - release = pde->proc_ops->proc_release; - if (release) { + const auto release = pde->proc_ops->proc_release; + if (release) return release(inode, file); - } return 0; } @@ -591,7 +567,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -617,7 +593,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, @@ -660,7 +636,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_private = de->data; inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); @@ -679,13 +655,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (de->proc_ops->proc_compat_ioctl) { - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_compat_ioctl(de)) { + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b701d0207edf..c1e8eb984da8 100644 --- a/fs/proc/internal.h +++ 
b/fs/proc/internal.h @@ -13,6 +13,7 @@ #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> +#include <linux/mm.h> struct ctl_table_header; struct mempolicy; @@ -43,7 +44,6 @@ struct proc_dir_entry { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; - const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); @@ -84,6 +84,25 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde) pde->flags |= PROC_ENTRY_PERMANENT; } +static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_read_iter; +} + +static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) +{ +#ifdef CONFIG_COMPAT + return pde->flags & PROC_ENTRY_proc_compat_ioctl; +#else + return false; +#endif +} + +static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_lseek; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -92,7 +111,7 @@ union proc_op { int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); - const char *lsm; + int lsmid; }; struct proc_inode { @@ -101,7 +120,7 @@ struct proc_inode { union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; - struct ctl_table *sysctl_entry; + const struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; @@ -142,6 +161,80 @@ unsigned name_to_int(const struct qstr *qstr); /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 +#ifdef CONFIG_PAGE_MAPCOUNT +/** + * folio_precise_page_mapcount() - Number of mappings of this folio page. + * @folio: The folio. + * @page: The page. + * + * The number of present user page table entries that reference this page + * as tracked via the RMAP: either referenced directly (PTE) or as part of + * a larger area that covers this page (e.g., PMD). + * + * Use this function only for the calculation of existing statistics + * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). + * + * Do not add new users. + * + * Returns: The number of mappings of this folio page. 0 for + * folios that are not mapped to user space or are not tracked via the RMAP + * (e.g., shared zeropage). + */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + int mapcount = atomic_read(&page->_mapcount) + 1; + + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + if (folio_test_large(folio)) + mapcount += folio_entire_mapcount(folio); + + return mapcount; +} +#else /* !CONFIG_PAGE_MAPCOUNT */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + BUILD_BUG(); +} +#endif /* CONFIG_PAGE_MAPCOUNT */ + +/** + * folio_average_page_mapcount() - Average number of mappings per page in this + * folio + * @folio: The folio. + * + * The average number of user page table entries that reference each page in + * this folio as tracked via the RMAP: either referenced directly (PTE) or + * as part of a larger area that covers this page (e.g., PMD). + * + * The average is calculated by rounding to the nearest integer; however, + * to avoid duplicated code in current callers, the average is at least + * 1 if any page of the folio is mapped. 
+ * + * Returns: The average number of mappings per page in this folio. + */ +static inline int folio_average_page_mapcount(struct folio *folio) +{ + int mapcount, entire_mapcount, avg; + + if (!folio_test_large(folio)) + return atomic_read(&folio->_mapcount) + 1; + + mapcount = folio_large_mapcount(folio); + if (unlikely(mapcount <= 0)) + return 0; + entire_mapcount = folio_entire_mapcount(folio); + if (mapcount <= entire_mapcount) + return entire_mapcount; + mapcount -= entire_mapcount; + + /* Round to closest integer ... */ + avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio); + /* ... but return at least 1. */ + return max_t(int, avg + entire_mapcount, 1); +} /* * array.c */ @@ -162,9 +255,9 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(struct user_namespace *, const struct path *, +extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct user_namespace *, struct dentry *, +extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); @@ -280,18 +373,27 @@ static inline void proc_tty_init(void) {} extern struct proc_dir_entry proc_root; extern void proc_self_init(void); +extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c */ struct mem_size_stats; + +struct proc_maps_locking_ctx { + struct mm_struct *mm; +#ifdef CONFIG_PER_VMA_LOCK + bool mmap_locked; + struct vm_area_struct *locked_vma; +#endif +}; + struct proc_maps_private { struct inode *inode; struct task_struct *task; - struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif + loff_t last_pos; + struct proc_maps_locking_ctx lock_ctx; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif @@ -316,5 +418,17 @@ extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ - pde->proc_dops = &proc_net_dentry_ops; + pde->flags |= PROC_ENTRY_FORCE_LOOKUP; +} + +/* + * Add a new procfs dentry that can't serve as a mountpoint. That should + * encompass anything that is ephemeral and can just disappear while the + * process is still around. + */ +static inline struct dentry *proc_splice_unmountable(struct inode *inode, + struct dentry *dentry, const struct dentry_operations *d_ops) +{ + dont_mount(dentry); + return d_splice_alias_ops(inode, dentry, d_ops); } diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index cb0edc7cbf09..714a22ded8a8 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -11,13 +11,13 @@ */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { - return (*pos <= nr_irqs) ? pos : NULL; + return *pos <= irq_get_nr_irqs() ? 
pos : NULL; } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos > nr_irqs) + if (*pos > irq_get_nr_irqs()) return NULL; return pos; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 71157ee35c1a..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,7 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> */ -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/kcore.h> @@ -24,7 +24,7 @@ #include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> @@ -34,8 +34,6 @@ #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -50,8 +48,26 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif +#ifndef kc_xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr +static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys) +{ + return __va(phys); +} +#endif +#ifndef kc_unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr +static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt) +{ +} +#endif + static LIST_HEAD(kclist_head); -static DECLARE_RWSEM(kclist_lock); +static int kcore_nphdr; +static size_t kcore_phdrs_len; +static size_t kcore_notes_len; +static size_t kcore_data_offset; +DEFINE_STATIC_PERCPU_RWSEM(kclist_lock); static int kcore_need_update = 1; /* @@ -87,33 +103,34 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, list_add_tail(&new->list, &kclist_head); } -static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, - size_t *data_offset) +static void update_kcore_size(void) { size_t try, size; struct kcore_list *m; - *nphdr = 1; /* PT_NOTE */ + kcore_nphdr = 1; /* PT_NOTE */ size = 0; list_for_each_entry(m, &kclist_head, list) { try = kc_vaddr_to_offset((size_t)m->addr + m->size); if (try > size) size = try; - *nphdr = *nphdr + 1; + kcore_nphdr++; } - *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + - VMCOREINFO_NOTE_NAME_BYTES + - ALIGN(sizeof(struct elf_prstatus), 4) + - ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4) + - ALIGN(vmcoreinfo_size, 4)); - *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + - *notes_len); - return *data_offset + size; + kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); + kcore_notes_len = (4 * sizeof(struct elf_note) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len + + kcore_notes_len); + proc_root_kcore->size = kcore_data_offset + size; } #ifdef CONFIG_HIGHMEM @@ -199,7 +216,7 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - if (!virt_addr_valid(ent->addr)) + if (!virt_addr_valid((void *)ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. 
*/ @@ -235,7 +252,7 @@ static int kcore_ram_list(struct list_head *list) int nid, ret; unsigned long end_pfn; - /* Not inialized....update now */ + /* Not initialized....update now */ /* find out "max pfn" */ end_pfn = 0; for_each_node_state(nid, N_MEMORY) { @@ -256,12 +273,10 @@ static int kcore_update_ram(void) { LIST_HEAD(list); LIST_HEAD(garbage); - int nphdr; - size_t phdrs_len, notes_len, data_offset; struct kcore_list *tmp, *pos; int ret = 0; - down_write(&kclist_lock); + percpu_down_write(&kclist_lock); if (!xchg(&kcore_need_update, 0)) goto out; @@ -279,11 +294,10 @@ static int kcore_update_ram(void) } list_splice_tail(&list, &kclist_head); - proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, - &data_offset); + update_kcore_size(); out: - up_write(&kclist_lock); + percpu_up_write(&kclist_lock); list_for_each_entry_safe(pos, tmp, &garbage, list) { list_del(&pos->list); kfree(pos); @@ -307,30 +321,29 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, *i = ALIGN(*i + descsz, 4); } -static ssize_t -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { + struct file *file = iocb->ki_filp; char *buf = file->private_data; - size_t phdrs_offset, notes_offset, data_offset; + loff_t *fpos = &iocb->ki_pos; + size_t phdrs_offset, notes_offset; size_t page_offline_frozen = 1; - size_t phdrs_len, notes_len; struct kcore_list *m; size_t tsz; - int nphdr; unsigned long start; + size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; int ret = 0; - down_read(&kclist_lock); + percpu_down_read(&kclist_lock); /* * Don't race against drivers that set PageOffline() and expect no * further page access. */ page_offline_freeze(); - get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); phdrs_offset = sizeof(struct elfhdr); - notes_offset = phdrs_offset + phdrs_len; + notes_offset = phdrs_offset + kcore_phdrs_len; /* ELF file header. */ if (buflen && *fpos < sizeof(struct elfhdr)) { @@ -352,25 +365,24 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) .e_flags = ELF_CORE_EFLAGS, .e_ehsize = sizeof(struct elfhdr), .e_phentsize = sizeof(struct elf_phdr), - .e_phnum = nphdr, + .e_phnum = kcore_nphdr, }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) { ret = -EFAULT; goto out; } - buffer += tsz; buflen -= tsz; *fpos += tsz; } /* ELF program headers. 
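The hunks above replace copy_to_user() on a caller-supplied buffer with copy_to_iter() on an iov_iter while serving the ELF file header. As a minimal sketch of that pattern, here is a hypothetical serve_slice() helper that is not part of the patch; it assumes the caller has already verified that *fpos falls inside the object, as read_kcore_iter() does before each block:

#include <linux/minmax.h>
#include <linux/uio.h>

/*
 * Clamp to what is left of the in-kernel object, treat a short
 * copy_to_iter() as -EFAULT, and advance the file position.
 */
static ssize_t serve_slice(struct iov_iter *iter, loff_t *fpos,
                           const void *obj, size_t obj_size, size_t buflen)
{
        size_t tsz = min_t(size_t, buflen, obj_size - *fpos);

        if (copy_to_iter(obj + *fpos, tsz, iter) != tsz)
                return -EFAULT;
        *fpos += tsz;
        return tsz;
}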
*/ - if (buflen && *fpos < phdrs_offset + phdrs_len) { + if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) { struct elf_phdr *phdrs, *phdr; - phdrs = kzalloc(phdrs_len, GFP_KERNEL); + phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL); if (!phdrs) { ret = -ENOMEM; goto out; @@ -378,13 +390,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) phdrs[0].p_type = PT_NOTE; phdrs[0].p_offset = notes_offset; - phdrs[0].p_filesz = notes_len; + phdrs[0].p_filesz = kcore_notes_len; phdr = &phdrs[1]; list_for_each_entry(m, &kclist_head, list) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + + kcore_data_offset; phdr->p_vaddr = (size_t)m->addr; if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); @@ -397,22 +410,22 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) phdr++; } - tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, - tsz)) { + tsz = min_t(size_t, buflen, + phdrs_offset + kcore_phdrs_len - *fpos); + if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, + iter) != tsz) { kfree(phdrs); ret = -EFAULT; goto out; } kfree(phdrs); - buffer += tsz; buflen -= tsz; *fpos += tsz; } /* ELF note segment. */ - if (buflen && *fpos < notes_offset + notes_len) { + if (buflen && *fpos < notes_offset + kcore_notes_len) { struct elf_prstatus prstatus = {}; struct elf_prpsinfo prpsinfo = { .pr_sname = 'R', @@ -421,20 +434,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) char *notes; size_t i = 0; - strlcpy(prpsinfo.pr_psargs, saved_command_line, + strscpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); - notes = kzalloc(notes_len, GFP_KERNEL); + notes = kzalloc(kcore_notes_len, GFP_KERNEL); if (!notes) { ret = -ENOMEM; goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it @@ -445,17 +458,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - min(vmcoreinfo_size, notes_len - i)); + min(vmcoreinfo_size, kcore_notes_len - i)); - tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + tsz = min_t(size_t, buflen, + notes_offset + kcore_notes_len - *fpos); + if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; goto out; } kfree(notes); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -464,7 +477,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. 
*/ - start = kc_offset_to_vaddr(*fpos - data_offset); + start = kc_offset_to_vaddr(*fpos - kcore_data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; @@ -472,19 +485,21 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) while (buflen) { struct page *page; unsigned long pfn; + phys_addr_t phys; + void *__start; /* * If this is the first iteration or the address is not within * the previous entry, search for a matching entry. */ if (!m || start < m->addr || start >= m->addr + m->size) { - struct kcore_list *iter; + struct kcore_list *pos; m = NULL; - list_for_each_entry(iter, &kclist_head, list) { - if (start >= iter->addr && - start < iter->addr + iter->size) { - m = iter; + list_for_each_entry(pos, &kclist_head, list) { + if (start >= pos->addr && + start < pos->addr + pos->size) { + m = pos; break; } } @@ -497,7 +512,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } if (!m) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -506,22 +521,40 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) switch (m->type) { case KCORE_VMALLOC: - vread(buf, (char *)start, tsz); - /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; + { + const char *src = (char *)start; + size_t read = 0, left = tsz; + + /* + * vmalloc uses spinlocks, so we optimistically try to + * read memory. If this fails, fault pages in and try + * again until we are done. + */ + while (true) { + read += vread_iter(iter, src, left); + if (read == tsz) + break; + + src += read; + left -= read; + + if (fault_in_iov_iter_writeable(iter, left)) { + ret = -EFAULT; + goto out; + } } break; + } case KCORE_USER: /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) { + if (copy_to_iter((char *)start, tsz, iter) != tsz) { ret = -EFAULT; goto out; } break; case KCORE_RAM: - pfn = __pa(start) >> PAGE_SHIFT; + phys = __pa(start); + pfn = phys >> PAGE_SHIFT; page = pfn_to_online_page(pfn); /* @@ -530,8 +563,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * and explicitly excluded physical ranges. */ if (!page || PageOffline(page) || - is_page_hwpoison(page) || !pfn_is_ram(pfn)) { - if (clear_user(buffer, tsz)) { + is_page_hwpoison(page) || !pfn_is_ram(pfn) || + pfn_is_unaccepted_memory(pfn)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -540,25 +574,45 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; + if (m->type == KCORE_RAM) { + __start = kc_xlate_dev_mem_ptr(phys); + if (!__start) { + ret = -ENOMEM; + if (iov_iter_zero(tsz, iter) != tsz) + ret = -EFAULT; goto out; } } else { - if (copy_to_user(buffer, buf, tsz)) { + __start = (void *)start; + } + + /* + * Sadly we must use a bounce buffer here to be able to + * make use of copy_from_kernel_nofault(), as these + * memory regions might not always be mapped on all + * architectures. 
+ */ + ret = copy_from_kernel_nofault(buf, __start, tsz); + if (m->type == KCORE_RAM) + kc_unxlate_dev_mem_ptr(phys, __start); + if (ret) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } + ret = 0; + /* + * We know the bounce buffer is safe to copy from, so + * use _copy_to_iter() directly. + */ + } else if (_copy_to_iter(buf, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; } break; default: pr_warn_once("Unhandled KCORE type: %d\n", m->type); - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -566,14 +620,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) skip: buflen -= tsz; *fpos += tsz; - buffer += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } out: page_offline_thaw(); - up_read(&kclist_lock); + percpu_up_read(&kclist_lock); if (ret) return ret; return orig_buflen - buflen; @@ -610,7 +663,8 @@ static int release_kcore(struct inode *inode, struct file *file) } static const struct proc_ops kcore_proc_ops = { - .proc_read = read_kcore, + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, .proc_release = release_kcore, .proc_lseek = default_llseek, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 440960110a42..a458f1e112fd 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -6,6 +6,7 @@ #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> @@ -16,6 +17,7 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#include <linux/zswap.h> #include <asm/page.h> #include "internal.h" @@ -87,10 +89,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP - seq_printf(m, "Zswap: %8lu kB\n", - (unsigned long)(zswap_pool_total_size >> 10)); + show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", - (unsigned long)atomic_read(&zswap_stored_pages) << + (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", @@ -119,10 +120,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); - show_val_kb(m, "Bounce: ", - global_zone_page_state(NR_BOUNCE)); - show_val_kb(m, "WritebackTmp: ", - global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "Bounce: ", 0); + show_val_kb(m, "WritebackTmp: ", 0); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", @@ -131,6 +130,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); + memtest_report_meminfo(m); + #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); @@ -155,6 +156,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_FREE_CMA_PAGES)); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + show_val_kb(m, "Unaccepted: ", + global_zone_page_state(NR_UNACCEPTED)); +#endif + show_val_kb(m, "Balloon: ", + global_node_page_state(NR_BALLOON_PAGES)); + hugetlb_report_meminfo(m); arch_report_meminfo(m); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 
8e159fc78c0a..ea2b597fd92c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; @@ -111,14 +111,13 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, ei->ns_ops = ns_ops; pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -152,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/page.c b/fs/proc/page.c index 6249c347809a..f9b2c2c906cd 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -20,7 +20,12 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) -#define KPMBITS (KPMSIZE * BITS_PER_BYTE) + +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; static inline unsigned long get_max_dump_pfn(void) { @@ -37,21 +42,33 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page counts - * - * Each entry is a u64 representing the corresponding - * physical page count. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static u64 get_kpage_count(const struct page *page) +{ + struct page_snapshot ps; + u64 ret; + + snapshot_page(&ps, page); + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + ret = folio_precise_page_mapcount(&ps.folio_snapshot, + &ps.page_snapshot); + else + ret = folio_average_page_mapcount(&ps.folio_snapshot); + + return ret; +} + +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 pcount; + u64 info; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -65,14 +82,27 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. 
*/ - ppage = pfn_to_online_page(pfn); - - if (!ppage || PageSlab(ppage) || page_has_type(ppage)) - pcount = 0; - else - pcount = page_mapcount(ppage); - - if (put_user(pcount, out)) { + page = pfn_to_online_page(pfn); + + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + info = get_kpage_count(page); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -90,27 +120,37 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } +/* /proc/kpagecount - an array exposing page mapcounts + * + * Each entry is a u64 representing the corresponding + * physical page mapcount. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + static const struct proc_ops kpagecount_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. - */ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -u64 stable_page_flags(struct page *page) +u64 stable_page_flags(const struct page *page) { - u64 k; - u64 u; + const struct folio *folio; + struct page_snapshot ps; + unsigned long k; + unsigned long mapping; + bool is_anon; + u64 u = 0; /* * pseudo flag: KPF_NOPAGE @@ -119,76 +159,63 @@ u64 stable_page_flags(struct page *page) if (!page) return 1 << KPF_NOPAGE; - k = page->flags; - u = 0; + snapshot_page(&ps, page); + folio = &ps.folio_snapshot; + + k = folio->flags.f; + mapping = (unsigned long)folio->mapping; + is_anon = mapping & FOLIO_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages - * - * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (folio_mapped(folio)) u |= 1 << KPF_MMAP; - if (PageAnon(page)) + if (is_anon) { u |= 1 << KPF_ANON; - if (PageKsm(page)) - u |= 1 << KPF_KSM; + if (mapping & FOLIO_MAPPING_KSM) + u |= 1 << KPF_KSM; + } /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; - if (PageTail(page)) + if (ps.idx == 0) + u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); + else u |= 1 << KPF_COMPOUND_TAIL; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; - /* - * PageTransCompound can be true for non-huge compound pages (slab - * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon - * to make sure a given page is a thp, not a non-huge compound page. 
- */ - else if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; - else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; - } - } else if (is_zero_pfn(page_to_pfn(page))) + else if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + /* Note: we indicate any THPs here, not just PMD-sized ones */ + u |= 1 << KPF_THP; + } else if (is_huge_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } else if (is_zero_pfn(ps.pfn)) { + u |= 1 << KPF_ZERO_PAGE; + } - - /* - * Caveats on high order pages: page->_refcount will only be set - * -1 on the head page; SLUB/SLQB do the same for PG_slab; - * SLOB won't set PG_slab at all on compound pages. - */ - if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; - else if (page_count(page) == 0 && is_free_buddy_page(page)) + if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY) u |= 1 << KPF_BUDDY; - if (PageOffline(page)) + if (folio_test_offline(folio)) u |= 1 << KPF_OFFLINE; - if (PageTable(page)) + if (folio_test_pgtable(folio)) u |= 1 << KPF_PGTABLE; + if (folio_test_slab(folio)) + u |= 1 << KPF_SLAB; - if (page_is_idle(page)) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) + u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); +#else + if (ps.flags & PAGE_SNAPSHOT_PG_IDLE) u |= 1 << KPF_IDLE; +#endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; - - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); @@ -198,7 +225,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - if (PageSwapCache(page)) +#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache)) + if ((k & SWAPCACHE) == SWAPCACHE) u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); @@ -206,67 +234,38 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); #ifdef CONFIG_MEMORY_FAILURE - u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); -#endif - -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); + if (u & (1 << KPF_HUGE)) + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + else + u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; -}; +} +EXPORT_SYMBOL_GPL(stable_page_flags); +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. 
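The /proc/kpagecount and /proc/kpageflags readers above (and /proc/kpagecgroup below) now funnel through kpage_read() and keep the same layout: one u64 per PFN. A small userspace sketch, hypothetical and not taken from the patch, that reads the flags word for a single PFN; these files are readable by root only:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        unsigned long pfn = argc > 1 ? strtoul(argv[1], NULL, 0) : 0;
        int fd = open("/proc/kpageflags", O_RDONLY);
        uint64_t flags;

        /* One u64 per PFN, so the file offset is pfn * sizeof(u64). */
        if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags)) {
                perror("kpageflags");
                return 1;
        }
        printf("pfn %lu: flags 0x%016llx\n", pfn, (unsigned long long)flags);
        return 0;
}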
+ */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (put_user(stable_page_flags(ppage), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } static const struct proc_ops kpageflags_proc_ops = { @@ -277,53 +276,10 @@ static const struct proc_ops kpageflags_proc_ops = { #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - static const struct proc_ops kpagecgroup_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 856839b8ae8b..52f0b75cbce2 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -135,6 +135,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. + * @state_size: The size of the per-file private state to allocate. * @data: Data for retrieval by pde_data(). 
* * Create a network namespaced proc file in the @parent directory with the @@ -299,7 +300,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, +static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -308,7 +309,7 @@ static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, net = get_proc_task_net(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; @@ -321,6 +322,7 @@ static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, + .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 48f2d60bd78a..49ab74e0bfde 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -17,10 +17,12 @@ #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> +#include <linux/lockdep.h> #include "internal.h" -#define list_for_each_table_entry(entry, table) \ - for ((entry) = (table); (entry)->procname; (entry)++) +#define list_for_each_table_entry(entry, header) \ + entry = header->ctl_table; \ + for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; @@ -28,9 +30,11 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* Support for permanently empty directories */ - -struct ctl_table sysctl_mount_point[] = { +/* + * Support for permanently empty directories. + * Must be non-empty to avoid sharing an address with other tables. 
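The list_for_each_table_entry() macro above now walks header->ctl_table_size entries instead of stopping at a NULL procname sentinel. A hypothetical sketch, not part of the patch, of what a minimal registration looks like under this sentinel-free scheme; the "kernel/demo" path and names are illustrative only:

#include <linux/init.h>
#include <linux/sysctl.h>

static int demo_value;

/*
 * No terminating empty entry at the end of the array: register_sysctl()
 * expands to register_sysctl_sz(path, table, ARRAY_SIZE(table)).
 */
static const struct ctl_table demo_table[] = {
        {
                .procname       = "demo_value",
                .data           = &demo_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
        demo_header = register_sysctl("kernel/demo", demo_table);
        return demo_header ? 0 : -ENOMEM;
}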
+ */ +static const struct ctl_table sysctl_mount_point[] = { { } }; @@ -44,25 +48,16 @@ struct ctl_table sysctl_mount_point[] = { */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl(path, sysctl_mount_point); + return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); -static bool is_empty_dir(struct ctl_table_header *head) -{ - return head->ctl_table[0].child == sysctl_mount_point; -} - -static void set_empty_dir(struct ctl_dir *dir) -{ - dir->header.ctl_table[0].child = sysctl_mount_point; -} - -static void clear_empty_dir(struct ctl_dir *dir) - -{ - dir->header.ctl_table[0].child = NULL; -} +#define sysctl_is_perm_empty_ctl_header(hptr) \ + (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_set_perm_empty_ctl_header(hptr) \ + (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_clear_perm_empty_ctl_header(hptr) \ + (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { @@ -73,12 +68,11 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } -static struct ctl_table root_table[] = { +static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, - { } }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { @@ -95,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry); + const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -116,14 +110,15 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) return cmp; } -/* Called under sysctl_lock */ -static struct ctl_table *find_entry(struct ctl_table_header **phead, +static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; + lockdep_assert_held(&sysctl_lock); + while (node) { struct ctl_node *ctl_node; @@ -148,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead, return NULL; } -static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; @@ -158,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) while (*p) { struct ctl_table_header *parent_head; - struct ctl_table *parent_entry; + const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; @@ -187,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) return 0; } -static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; @@ -196,9 +191,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, 
- struct ctl_node *node, struct ctl_table *table) + struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; + head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; @@ -210,45 +206,49 @@ static void init_header(struct ctl_table_header *head, head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { - struct ctl_table *entry; + const struct ctl_table *entry; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { node->header = head; node++; } } + if (table == sysctl_mount_point) + sysctl_set_perm_empty_ctl_header(head); } static void erase_header(struct ctl_table_header *head) { - struct ctl_table *entry; + const struct ctl_table *entry; - list_for_each_table_entry(entry, head->ctl_table) + list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { - struct ctl_table *entry; + const struct ctl_table *entry; + struct ctl_table_header *dir_h = &dir->header; int err; + /* Is this a permanently empty directory? */ - if (is_empty_dir(&dir->header)) + if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ - if (header->ctl_table == sysctl_mount_point) { + if (sysctl_is_perm_empty_ctl_header(header)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; - set_empty_dir(dir); + sysctl_set_perm_empty_ctl_header(dir_h); } - dir->header.nreg++; + dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; @@ -259,24 +259,26 @@ fail: put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) - clear_empty_dir(dir); + sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; - drop_sysctl_table(&dir->header); + drop_sysctl_table(dir_h); return err; } -/* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (unlikely(p->unregistering)) return 0; p->used++; return 1; } -/* called under sysctl_lock */ static void unuse_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); @@ -287,9 +289,11 @@ static void proc_sys_invalidate_dcache(struct ctl_table_header *head) proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } -/* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { + /* will reacquire if has to wait */ + lockdep_assert_held(&sysctl_lock); + /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock @@ -346,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root) return set; } -static struct ctl_table *lookup_entry(struct ctl_table_header **phead, - struct ctl_dir *dir, - const char *name, int namelen) +static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); @@ -376,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node) } static void first_entry(struct ctl_dir *dir, - 
struct ctl_table_header **phead, struct ctl_table **pentry) + struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; - struct ctl_table *entry = NULL; + const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); @@ -393,10 +397,10 @@ static void first_entry(struct ctl_dir *dir, *pentry = entry; } -static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; - struct ctl_table *entry = *pentry; + const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); @@ -429,7 +433,7 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; @@ -443,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i } static struct inode *proc_sys_make_inode(struct super_block *sb, - struct ctl_table_header *head, struct ctl_table *table) + struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; @@ -469,7 +473,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -479,16 +483,14 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; - if (is_empty_dir(head)) + if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) - root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); - else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } + root->set_ownership(head, &inode->i_uid, &inode->i_gid); return inode; } @@ -516,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; - struct ctl_table *p; + const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; @@ -539,13 +541,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, } inode = proc_sys_make_inode(dir->i_sb, h ? 
h : head, p); - if (IS_ERR(inode)) { - err = ERR_CAST(inode); - goto out; - } - - d_set_d_op(dentry, &proc_sys_dentry_operations); - err = d_splice_alias(inode, dentry); + err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) @@ -559,7 +555,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; @@ -633,7 +629,7 @@ static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) @@ -651,7 +647,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; @@ -682,7 +678,7 @@ out: static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; @@ -703,20 +699,15 @@ static bool proc_sys_fill_cache(struct file *file, if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (IS_ERR(inode)) { - d_lookup_done(child); - dput(child); - return false; - } - d_set_d_op(child, &proc_sys_dentry_operations); - res = d_splice_alias(inode, child); + res = d_splice_alias_ops(inode, child, + &proc_sys_dentry_operations); d_lookup_done(child); if (unlikely(res)) { - if (IS_ERR(res)) { - dput(child); - return false; - } dput(child); + + if (IS_ERR(res)) + return false; + child = res; } } @@ -731,7 +722,7 @@ static bool proc_sys_fill_cache(struct file *file, static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { bool ret = true; @@ -749,7 +740,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, struct ctl_table *table, +static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { @@ -773,7 +764,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; - struct ctl_table *entry; + const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; @@ -798,7 +789,7 @@ out: return 0; } -static int proc_sys_permission(struct user_namespace *mnt_userns, +static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -806,7 +797,7 @@ static int proc_sys_permission(struct user_namespace *mnt_userns, * are _NOT_ writeable, capabilities or not. 
*/ struct ctl_table_header *head; - struct ctl_table *table; + const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ @@ -827,7 +818,7 @@ static int proc_sys_permission(struct user_namespace *mnt_userns, return error; } -static int proc_sys_setattr(struct user_namespace *mnt_userns, +static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -836,27 +827,26 @@ static int proc_sys_setattr(struct user_namespace *mnt_userns, if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); - mark_inode_dirty(inode); + setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } -static int proc_sys_getattr(struct user_namespace *mnt_userns, +static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; @@ -869,7 +859,7 @@ static const struct file_operations proc_sys_file_operations = { .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; @@ -893,7 +883,8 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -926,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry, struct ctl_table_header *head; struct inode *inode; - /* Although proc doesn't have negative dentries, rcu-walk means - * that inode here can be NULL */ - /* AV: can it, indeed? 
*/ - inode = d_inode_rcu(dentry); - if (!inode) - return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - head = rcu_dereference(PROC_I(inode)->sysctl); + + // false positive is fine here - we'll recheck anyway + if (d_in_lookup(dentry)) + return 0; + + inode = d_inode_rcu(dentry); + // we just might have run into dentry in the middle of __dentry_kill() + if (!inode) + return 1; + + head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } @@ -950,7 +945,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) @@ -969,18 +964,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + - sizeof(struct ctl_table)*2 + namelen + 1, + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); - new_name = (char *)(table + 2); + new_name = (char *)(table + 1); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->dir.header.root, set, node, table); + init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } @@ -1061,12 +1056,12 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry) + const struct ctl_table **pentry) { struct ctl_table_header *head; + const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; - struct ctl_table *entry; struct ctl_dir *dir; int ret; @@ -1093,7 +1088,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, return ret; } -static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; @@ -1109,8 +1104,9 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) 
return -EINVAL; } -static int sysctl_check_table_array(const char *path, struct ctl_table *table) +static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { + unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || @@ -1122,20 +1118,38 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); + + if (table->extra1) { + extra = *(unsigned int *) table->extra1; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } + if (table->extra2) { + extra = *(unsigned int *) table->extra2; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } + } + + if (table->proc_handler == proc_dobool) { + if (table->maxlen != sizeof(bool)) + err |= sysctl_err(path, table, "array not allowed"); } return err; } -static int sysctl_check_table(const char *path, struct ctl_table *table) +static int sysctl_check_table(const char *path, struct ctl_table_header *header) { - struct ctl_table *entry; + const struct ctl_table *entry; int err = 0; - list_for_each_table_entry(entry, table) { - if (entry->child) - err |= sysctl_err(path, entry, "Not a file"); - + list_for_each_table_entry(entry, header) { + if (!entry->procname) + err |= sysctl_err(path, entry, "procname is null"); if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || @@ -1163,25 +1177,23 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) return err; } -static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, - struct ctl_table_root *link_root) +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { - struct ctl_table *link_table, *entry, *link; + struct ctl_table *link_table, *link; struct ctl_table_header *links; + const struct ctl_table *entry; struct ctl_node *node; char *link_name; - int nr_entries, name_bytes; + int name_bytes; name_bytes = 0; - nr_entries = 0; - list_for_each_table_entry(entry, table) { - nr_entries++; + list_for_each_table_entry(entry, head) { name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries + - sizeof(struct ctl_table)*(nr_entries + 1) + + sizeof(struct ctl_node)*head->ctl_table_size + + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); @@ -1189,35 +1201,41 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table return NULL; node = (struct ctl_node *)(links + 1); - link_table = (struct ctl_table *)(node + nr_entries); - link_name = (char *)&link_table[nr_entries + 1]; + link_table = (struct ctl_table *)(node + head->ctl_table_size); + link_name = (char *)(link_table + head->ctl_table_size); link = link_table; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; - link->data = link_root; + link->data = head->root; link_name += len; link++; } - init_header(links, dir->header.root, dir->header.set, node, link_table); - links->nreg = nr_entries; + 
init_header(links, dir->header.root, dir->header.set, node, link_table, + head->ctl_table_size); + links->nreg = head->ctl_table_size; return links; } static bool get_links(struct ctl_dir *dir, - struct ctl_table *table, struct ctl_table_root *link_root) + struct ctl_table_header *header, + struct ctl_table_root *link_root) { - struct ctl_table_header *head; - struct ctl_table *entry, *link; + struct ctl_table_header *tmp_head; + const struct ctl_table *entry, *link; + + if (header->ctl_table_size == 0 || + sysctl_is_perm_empty_ctl_header(header)) + return true; /* Are there links available for every entry in table? */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); + link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) @@ -1228,10 +1246,10 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. Increase the registration count on the links */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); - head->nreg++; + link = find_entry(&tmp_head, dir, procname, strlen(procname)); + tmp_head->nreg++; } return true; } @@ -1250,13 +1268,13 @@ static int insert_links(struct ctl_table_header *head) if (IS_ERR(core_parent)) return 0; - if (get_links(core_parent, head->ctl_table, head->root)) + if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); - links = new_links(core_parent, head->ctl_table, head->root); + links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; @@ -1264,7 +1282,7 @@ static int insert_links(struct ctl_table_header *head) goto out; err = 0; - if (get_links(core_parent, head->ctl_table, head->root)) { + if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } @@ -1277,34 +1295,64 @@ out: return err; } +/* Find the directory for the ctl_table. If one is not found create it. */ +static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) +{ + const char *name, *nextname; + + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + break; + } + return dir; +} + /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * + * @table: the top-level table structure. This table should not be free'd + * after registration. So it should not be used on stack. It can either + * be a global or dynamically allocated by the caller and free'd later + * after sysctl unregistration. + * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. + * array. 
* * The members of the &struct ctl_table structure are used as follows: - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file - * - * child - must be %NULL. - * + * data - a pointer to data for use by proc_handler + * maxlen - the maximum size in bytes of the data + * mode - the file permissions for the /proc/sys file + * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. + * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - @@ -1321,53 +1369,32 @@ out: */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table) + const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; - const char *name, *nextname; struct ctl_dir *dir; - struct ctl_table *entry; struct ctl_node *node; - int nr_entries = 0; - - list_for_each_table_entry(entry, table) - nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); - init_header(header, root, set, node, table); - if (sysctl_check_table(path, table)) + init_header(header, root, set, node, table, table_size); + if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; - /* Reference moved down the diretory tree get_subdir */ + /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); - /* Find the directory for the ctl_table */ - for (name = path; name; name = nextname) { - int namelen; - nextname = strchr(name, '/'); - if (nextname) { - namelen = nextname - name; - nextname++; - } else { - namelen = strlen(name); - } - if (namelen == 0) - continue; - - dir = get_subdir(dir, name, namelen); - if (IS_ERR(dir)) - goto fail; - } - + dir = sysctl_mkdir_p(dir, path); + if (IS_ERR(dir)) + goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; @@ -1382,33 +1409,45 @@ fail_put_dir_locked: spin_unlock(&sysctl_lock); fail: kfree(header); - dump_stack(); return NULL; } /** - * register_sysctl - register a sysctl table - * @path: The path to the directory the sysctl table is in. - * @table: the table structure + * register_sysctl_sz - register a sysctl table + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The calller must ensure the life of the @table + * will be kept during the lifetime use of the syctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. 
If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the syctl + * registration. + * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ -struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, + size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, - path, table); + path, table, table_size); } -EXPORT_SYMBOL(register_sysctl); +EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path - * @path: path name for sysctl base - * @table: This is the sysctl table that needs to be registered to the path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. + * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails + * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default @@ -1418,254 +1457,27 @@ EXPORT_SYMBOL(register_sysctl); * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * - * Context: Can only be called after your respective sysctl base path has been - * registered. So for instance, most base directories are registered early on - * init before init levels are processed through proc_sys_init() and - * sysctl_init_bases(). + * Context: if your base directory does not exist it will be created for you. 
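 *
 * (Editor's aside, not part of this patch: a minimal sketch of how a caller
 * might use the size-aware registration API documented above, assuming
 * register_sysctl_init() is the convenience wrapper around this function.
 * The "demo_*" names are hypothetical.)
 *
 *	static unsigned int demo_interval = 10;
 *
 *	static const struct ctl_table demo_table[] = {
 *		{
 *			.procname	= "demo_interval",
 *			.data		= &demo_interval,
 *			.maxlen		= sizeof(unsigned int),
 *			.mode		= 0644,
 *			.proc_handler	= proc_douintvec,
 *		},
 *	};
 *
 *	register_sysctl_init("kernel/demo", demo_table);
 *
 * A caller that needs to unregister later would instead keep the header
 * returned by register_sysctl_sz("kernel/demo", demo_table,
 * ARRAY_SIZE(demo_table)).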
*/ -void __init __register_sysctl_init(const char *path, struct ctl_table *table, - const char *table_name) +void __init __register_sysctl_init(const char *path, const struct ctl_table *table, + const char *table_name, size_t table_size) { - struct ctl_table_header *hdr = register_sysctl(path, table); + struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { - pr_err("failed when register_sysctl %s to %s\n", table_name, path); + pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } -static char *append_path(const char *path, char *pos, const char *name) -{ - int namelen; - namelen = strlen(name); - if (((pos - path) + namelen + 2) >= PATH_MAX) - return NULL; - memcpy(pos, name, namelen); - pos[namelen] = '/'; - pos[namelen + 1] = '\0'; - pos += namelen + 1; - return pos; -} - -static int count_subheaders(struct ctl_table *table) -{ - int has_files = 0; - int nr_subheaders = 0; - struct ctl_table *entry; - - /* special case: no directory and empty directory */ - if (!table || !table->procname) - return 1; - - list_for_each_table_entry(entry, table) { - if (entry->child) - nr_subheaders += count_subheaders(entry->child); - else - has_files = 1; - } - return nr_subheaders + has_files; -} - -static int register_leaf_sysctl_tables(const char *path, char *pos, - struct ctl_table_header ***subheader, struct ctl_table_set *set, - struct ctl_table *table) -{ - struct ctl_table *ctl_table_arg = NULL; - struct ctl_table *entry, *files; - int nr_files = 0; - int nr_dirs = 0; - int err = -ENOMEM; - - list_for_each_table_entry(entry, table) { - if (entry->child) - nr_dirs++; - else - nr_files++; - } - - files = table; - /* If there are mixed files and directories we need a new table */ - if (nr_dirs && nr_files) { - struct ctl_table *new; - files = kcalloc(nr_files + 1, sizeof(struct ctl_table), - GFP_KERNEL); - if (!files) - goto out; - - ctl_table_arg = files; - new = files; - - list_for_each_table_entry(entry, table) { - if (entry->child) - continue; - *new = *entry; - new++; - } - } - - /* Register everything except a directory full of subdirectories */ - if (nr_files || !nr_dirs) { - struct ctl_table_header *header; - header = __register_sysctl_table(set, path, files); - if (!header) { - kfree(ctl_table_arg); - goto out; - } - - /* Remember if we need to free the file table */ - header->ctl_table_arg = ctl_table_arg; - **subheader = header; - (*subheader)++; - } - - /* Recurse into the subdirectories. */ - list_for_each_table_entry(entry, table) { - char *child_pos; - - if (!entry->child) - continue; - - err = -ENAMETOOLONG; - child_pos = append_path(path, pos, entry->procname); - if (!child_pos) - goto out; - - err = register_leaf_sysctl_tables(path, child_pos, subheader, - set, entry->child); - pos[0] = '\0'; - if (err) - goto out; - } - err = 0; -out: - /* On failure our caller will unregister all registered subheaders */ - return err; -} - -/** - * __register_sysctl_paths - register a sysctl table hierarchy - * @set: Sysctl tree to register on - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_table for more details. 
- */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_set *set, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table *ctl_table_arg = table; - int nr_subheaders = count_subheaders(table); - struct ctl_table_header *header = NULL, **subheaders, **subheader; - const struct ctl_path *component; - char *new_path, *pos; - - pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!new_path) - return NULL; - - pos[0] = '\0'; - for (component = path; component->procname; component++) { - pos = append_path(new_path, pos, component->procname); - if (!pos) - goto out; - } - while (table->procname && table->child && !table[1].procname) { - pos = append_path(new_path, pos, table->procname); - if (!pos) - goto out; - table = table->child; - } - if (nr_subheaders == 1) { - header = __register_sysctl_table(set, new_path, table); - if (header) - header->ctl_table_arg = ctl_table_arg; - } else { - header = kzalloc(sizeof(*header) + - sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); - if (!header) - goto out; - - subheaders = (struct ctl_table_header **) (header + 1); - subheader = subheaders; - header->ctl_table_arg = ctl_table_arg; - - if (register_leaf_sysctl_tables(new_path, pos, &subheader, - set, table)) - goto err_register_leaves; - } - -out: - kfree(new_path); - return header; - -err_register_leaves: - while (subheader > subheaders) { - struct ctl_table_header *subh = *(--subheader); - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); - } - kfree(header); - header = NULL; - goto out; -} - -/** - * register_sysctl_paths - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root.default_set, - path, table); -} -EXPORT_SYMBOL(register_sysctl_paths); - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. 
- */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} -EXPORT_SYMBOL(register_sysctl_table); - -int __register_sysctl_base(struct ctl_table *base_table) -{ - struct ctl_table_header *hdr; - - hdr = register_sysctl_table(base_table); - kmemleak_not_leak(hdr); - return 0; -} - static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; - struct ctl_table *entry; + const struct ctl_table *entry; if (header->set == root_set) return; @@ -1674,9 +1486,9 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; - struct ctl_table *link; + const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); @@ -1714,35 +1526,18 @@ static void drop_sysctl_table(struct ctl_table_header *header) /** * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table + * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { - int nr_subheaders; might_sleep(); if (header == NULL) return; - nr_subheaders = count_subheaders(header->ctl_table_arg); - if (unlikely(nr_subheaders > 1)) { - struct ctl_table_header **subheaders; - int i; - - subheaders = (struct ctl_table_header **)(header + 1); - for (i = nr_subheaders -1; i >= 0; i--) { - struct ctl_table_header *subh = subheaders[i]; - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); - } - kfree(header); - return; - } - spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); @@ -1755,7 +1550,7 @@ void setup_sysctl_set(struct ctl_table_set *set, { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; - init_header(&set->dir.header, root, set, NULL, root_table); + init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) @@ -1794,7 +1589,6 @@ static const struct sysctl_alias sysctl_aliases[] = { {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, - {"softlockup_panic", "kernel.softlockup_panic" }, { } }; @@ -1810,6 +1604,13 @@ static const char *sysctl_find_alias(char *param) return NULL; } +bool sysctl_is_alias(char *param) +{ + const char *alias = sysctl_find_alias(param); + + return alias != NULL; +} + /* Set sysctl value passed on kernel command line. 
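 *
 * (Editor's aside, not part of this patch: with the alias table above, a boot
 * command line entry such as
 *
 *	hung_task_panic=1
 *
 * is handled as the equivalent of
 *
 *	sysctl.kernel.hung_task_panic=1
 *
 * and the new sysctl_is_alias() helper presumably lets early parameter
 * parsing recognise such aliases instead of treating them as unknown
 * parameters.)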
*/ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) diff --git a/fs/proc/root.c b/fs/proc/root.c index 3c2ee3eb1138..d8ca41d823e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ns_type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. 
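 *
 * (Editor's aside, not part of this patch: a hedged sketch of how the pidns=
 * option might be used from userspace, either as a path string or as an nsfs
 * file descriptor through the new mount API; "nsfd" is a hypothetical fd
 * opened from /proc/<pid>/ns/pid.)
 *
 *	mount -t proc -o pidns=/proc/123/ns/pid proc /tmp/proc
 *
 *	int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
 *	fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
 *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	move_mount(fsmount(fsfd, FSMOUNT_CLOEXEC, 0), "",
 *		   AT_FDCWD, "/tmp/proc", MOVE_MOUNT_F_EMPTY_PATH);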
+ */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -188,7 +274,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; + s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); @@ -261,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); - if (!fs_info) { - kill_anon_super(sb); - return; - } - - dput(fs_info->proc_self); - dput(fs_info->proc_thread_self); - kill_anon_super(sb); - put_pid_ns(fs_info->pid_ns); - kfree(fs_info); + if (fs_info) { + put_pid_ns(fs_info->pid_ns); + kfree_rcu(fs_info, rcu); + } } static struct file_system_type proc_fs_type = { @@ -310,11 +390,12 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int proc_root_getattr(struct user_namespace *mnt_userns, +static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } @@ -362,12 +443,12 @@ static const struct inode_operations proc_root_inode_operations = { * This is the root "inode" in the /proc tree.. 
*/ struct proc_dir_entry proc_root = { - .low_ino = PROC_ROOT_INO, - .namelen = 5, - .mode = S_IFDIR | S_IRUGO | S_IXUGO, - .nlink = 2, + .low_ino = PROCFS_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, .refcnt = REFCOUNT_INIT(1), - .proc_iops = &proc_root_inode_operations, + .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..62d2c0cfe35c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum __ro_after_init; +unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; @@ -46,23 +45,20 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; - d_add(self, inode); + d_make_persistent(self, inode); ret = 0; - } else { - dput(self); } + dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - else - fs_info->proc_self = self; return ret; } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index f4616083faef..04bb29721419 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) - seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10); seq_putc(p, '\n'); } return 0; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 4fb8729a68d4..8b444e862319 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,30 +22,6 @@ #define arch_irq_stat() 0 #endif -#ifdef arch_idle_time - -u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 idle; - - idle = kcs->cpustat[CPUTIME_IDLE]; - if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); - return idle; -} - -static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 iowait; - - iowait = kcs->cpustat[CPUTIME_IOWAIT]; - if (cpu_online(cpu) && nr_iowait_cpu(cpu)) - iowait += arch_idle_time(cpu); - return iowait; -} - -#else - u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -78,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) return iowait; } -#endif - static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; @@ -102,7 +76,7 @@ static void show_all_irqs(struct seq_file *p) seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } - show_irq_gap(p, nr_irqs - next); + show_irq_gap(p, irq_get_nr_irqs() - next); } static int show_stat(struct seq_file *p, void *v) @@ -222,7 +196,7 @@ static int stat_open(struct inode *inode, struct file *file) unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ - size += 2 * nr_irqs; + size += 2 * irq_get_nr_irqs(); return 
single_open_size(file, show_stat, NULL, size); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..81dfc26bfae8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -4,6 +4,7 @@ #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> +#include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> @@ -13,18 +14,24 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <linux/minmax.h> +#include <linux/overflow.h> +#include <linux/buildid.h> #include <asm/elf.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" +#define SENTINEL_VMA_END -1 +#define SENTINEL_VMA_GATE -2 + #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) @@ -32,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); + anon = get_mm_counter_sum(mm, MM_ANONPAGES); + file = get_mm_counter_sum(mm, MM_FILEPAGES); + shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and @@ -55,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; - swap = get_mm_counter(mm, MM_SWAPENTS); + swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); @@ -88,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; - *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } @@ -123,16 +130,143 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, - loff_t *ppos) +#ifdef CONFIG_PER_VMA_LOCK + +static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) +{ + lock_ctx->locked_vma = NULL; + lock_ctx->mmap_locked = false; +} + +static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->locked_vma) { + vma_end_read(lock_ctx->locked_vma); + lock_ctx->locked_vma = NULL; + } +} + +static const struct seq_operations proc_pid_maps_op; + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_locking_ctx *lock_ctx) +{ + /* + * smaps and numa_maps perform page table walk, therefore require + * mmap_lock but maps can be read with locking just the vma and + * walking the vma tree under rcu read protection. 
+ */ + if (m->op != &proc_pid_maps_op) { + if (mmap_read_lock_killable(lock_ctx->mm)) + return false; + + lock_ctx->mmap_locked = true; + } else { + rcu_read_lock(); + reset_lock_ctx(lock_ctx); + } + + return true; +} + +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); + } else { + unlock_ctx_vma(lock_ctx); + rcu_read_unlock(); + } +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; + struct vm_area_struct *vma; + + if (lock_ctx->mmap_locked) + return vma_next(&priv->iter); + + unlock_ctx_vma(lock_ctx); + vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos); + if (!IS_ERR_OR_NULL(vma)) + lock_ctx->locked_vma = vma; + + return vma; +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; + + if (lock_ctx->mmap_locked) + return false; + + rcu_read_unlock(); + mmap_read_lock(lock_ctx->mm); + /* Reinitialize the iterator after taking mmap_lock */ + vma_iter_set(&priv->iter, pos); + lock_ctx->mmap_locked = true; + + return true; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm) == 0; +} + +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + return vma_next(&priv->iter); +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + return false; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) { - struct vm_area_struct *vma = vma_next(&priv->iter); + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma; + +retry: + vma = get_next_vma(priv, *ppos); + /* EINTR of EAGAIN is possible */ + if (IS_ERR(vma)) { + if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) + goto retry; + + return vma; + } + /* Store previous position to be able to restart if needed */ + priv->last_pos = *ppos; if (vma) { - *ppos = vma->vm_start; + /* + * Track the end of the reported vma to ensure position changes + * even if previous vma was merged with the next vma and we + * found the extended vma with the same vm_start. + */ + *ppos = vma->vm_end; } else { - *ppos = -2UL; - vma = get_gate_vma(priv->mm); + *ppos = SENTINEL_VMA_GATE; + vma = get_gate_vma(priv->lock_ctx.mm); } return vma; @@ -141,58 +275,66 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = *ppos; + struct proc_maps_locking_ctx *lock_ctx; + loff_t last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. 
*/ - if (last_addr == -1UL) + if (last_addr == SENTINEL_VMA_END) return NULL; priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; + lock_ctx = &priv->lock_ctx; + mm = lock_ctx->mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; return NULL; } - if (mmap_read_lock_killable(mm)) { + if (!lock_vma_range(m, lock_ctx)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } - vma_iter_init(&priv->iter, mm, last_addr); + /* + * Reset current position if last_addr was set before + * and it's not a sentinel. + */ + if (last_addr > 0) + *ppos = last_addr = priv->last_pos; + vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); hold_task_mempolicy(priv); - if (last_addr == -2UL) + if (last_addr == SENTINEL_VMA_GATE) return get_gate_vma(mm); - return proc_get_vma(priv, ppos); + return proc_get_vma(m, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - if (*ppos == -2UL) { - *ppos = -1UL; + if (*ppos == SENTINEL_VMA_GATE) { + *ppos = SENTINEL_VMA_END; return NULL; } - return proc_get_vma(m->private, ppos); + return proc_get_vma(m, ppos); } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mm_struct *mm = priv->mm; + struct mm_struct *mm = priv->lock_ctx.mm; if (!priv->task) return; release_task_mempolicy(priv); - mmap_read_unlock(mm); + unlock_vma_range(&priv->lock_ctx); mmput(mm); put_task_struct(priv->task); priv->task = NULL; @@ -207,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->lock_ctx.mm)) { + int err = PTR_ERR(priv->lock_ctx.mm); seq_release_private(inode, file); return err; @@ -223,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } @@ -236,19 +378,65 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct vm_area_struct *vma) +static void get_vma_name(struct vm_area_struct *vma, + const struct path **path, + const char **name, + const char **name_fmt) { + struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; + + *name = NULL; + *path = NULL; + *name_fmt = NULL; + /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; + if (vma->vm_file) { + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. 
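 *
 * (Editor's aside, not part of this patch: such a name is set by the owning
 * process with something like
 *
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, addr, length, "my-buffer");
 *
 * where addr and length describe the hypothetical anonymous mapping.)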
+ */ + if (anon_name) { + *name_fmt = "[anon_shmem:%s]"; + *name = anon_name->name; + } else { + *path = file_user_path(vma->vm_file); + } + return; + } + + if (vma->vm_ops && vma->vm_ops->name) { + *name = vma->vm_ops->name(vma); + if (*name) + return; + } + + *name = arch_vma_name(vma); + if (*name) + return; + + if (!vma->vm_mm) { + *name = "[vdso]"; + return; + } + + if (vma_is_initial_heap(vma)) { + *name = "[heap]"; + return; + } + + if (vma_is_initial_stack(vma)) { + *name = "[stack]"; + return; + } + + if (anon_name) { + *name_fmt = "[anon:%s]"; + *name = anon_name->name; + return; + } } static void show_vma_header_prefix(struct seq_file *m, @@ -274,18 +462,17 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { - struct anon_vma_name *anon_name = NULL; - struct mm_struct *mm = vma->vm_mm; - struct file *file = vma->vm_file; + const struct path *path; + const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - const char *name = NULL; - if (file) { - struct inode *inode = file_inode(vma->vm_file); + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -294,58 +481,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); - if (mm) - anon_name = anon_vma_name(vma); - /* - * Print the dentry name for named mappings, and a - * special [heap] marker for the heap: - */ - if (file) { + get_vma_name(vma, &path, &name, &name_fmt); + if (path) { seq_pad(m, ' '); - /* - * If user named this anon shared memory via - * prctl(PR_SET_VMA ..., use the provided name. 
- */ - if (anon_name) - seq_printf(m, "[anon_shmem:%s]", anon_name->name); - else - seq_file_path(m, file, "\n"); - goto done; - } - - if (vma->vm_ops && vma->vm_ops->name) { - name = vma->vm_ops->name(vma); - if (name) - goto done; - } - - name = arch_vma_name(vma); - if (!name) { - if (!mm) { - name = "[vdso]"; - goto done; - } - - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { - name = "[heap]"; - goto done; - } - - if (is_stack(vma)) { - name = "[stack]"; - goto done; - } - - if (anon_name) { - seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name->name); - } - } - -done: - if (name) { + seq_path(m, path, "\n"); + } else if (name_fmt) { + seq_pad(m, ' '); + seq_printf(m, name_fmt, name); + } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } @@ -370,11 +514,315 @@ static int pid_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } +#define PROCMAP_QUERY_VMA_FLAGS ( \ + PROCMAP_QUERY_VMA_READABLE | \ + PROCMAP_QUERY_VMA_WRITABLE | \ + PROCMAP_QUERY_VMA_EXECUTABLE | \ + PROCMAP_QUERY_VMA_SHARED \ +) + +#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ + PROCMAP_QUERY_FILE_BACKED_VMA | \ + PROCMAP_QUERY_VMA_FLAGS \ +) + +#ifdef CONFIG_PER_VMA_LOCK + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) +{ + reset_lock_ctx(lock_ctx); + + return 0; +} + +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); + lock_ctx->mmap_locked = false; + } else { + unlock_ctx_vma(lock_ctx); + } +} + +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) +{ + struct mm_struct *mm = lock_ctx->mm; + struct vm_area_struct *vma; + struct vma_iterator vmi; + + if (lock_ctx->mmap_locked) + return find_vma(mm, addr); + + /* Unlock previously locked VMA and find the next one under RCU */ + unlock_ctx_vma(lock_ctx); + rcu_read_lock(); + vma_iter_init(&vmi, mm, addr); + vma = lock_next_vma(mm, &vmi, addr); + rcu_read_unlock(); + + if (!vma) + return NULL; + + if (!IS_ERR(vma)) { + lock_ctx->locked_vma = vma; + return vma; + } + + if (PTR_ERR(vma) == -EAGAIN) { + /* Fallback to mmap_lock on vma->vm_refcnt overflow */ + mmap_read_lock(mm); + vma = find_vma(mm, addr); + lock_ctx->mmap_locked = true; + } + + return vma; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm); +} + +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) +{ + return find_vma(lock_ctx->mm, addr); +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr, u32 flags) +{ + struct vm_area_struct *vma; + +next_vma: + vma = query_vma_find_by_addr(lock_ctx, addr); + if (IS_ERR(vma)) + return vma; + + if (!vma) + goto no_vma; + + /* user requested only file-backed VMA, keep iterating */ + if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) + goto skip_vma; + + /* VMA permissions should satisfy query flags */ + if (flags & PROCMAP_QUERY_VMA_FLAGS) { + u32 perm = 0; + + if (flags & PROCMAP_QUERY_VMA_READABLE) + perm |= VM_READ; + if (flags & PROCMAP_QUERY_VMA_WRITABLE) + perm |= VM_WRITE; + if (flags & 
PROCMAP_QUERY_VMA_EXECUTABLE) + perm |= VM_EXEC; + if (flags & PROCMAP_QUERY_VMA_SHARED) + perm |= VM_MAYSHARE; + + if ((vma->vm_flags & perm) != perm) + goto skip_vma; + } + + /* found covering VMA or user is OK with the matching next VMA */ + if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) + return vma; + +skip_vma: + /* + * If the user needs closest matching VMA, keep iterating. + */ + addr = vma->vm_end; + if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) + goto next_vma; + +no_vma: + return ERR_PTR(-ENOENT); +} + +static int do_procmap_query(struct mm_struct *mm, void __user *uarg) +{ + struct proc_maps_locking_ctx lock_ctx = { .mm = mm }; + struct procmap_query karg; + struct vm_area_struct *vma; + const char *name = NULL; + char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; + __u64 usize; + int err; + + if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) + return -EFAULT; + /* argument struct can never be that large, reject abuse */ + if (usize > PAGE_SIZE) + return -E2BIG; + /* argument struct should have at least query_flags and query_addr fields */ + if (usize < offsetofend(struct procmap_query, query_addr)) + return -EINVAL; + err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + if (err) + return err; + + /* reject unknown flags */ + if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) + return -EINVAL; + /* either both buffer address and size are set, or both should be zero */ + if (!!karg.vma_name_size != !!karg.vma_name_addr) + return -EINVAL; + if (!!karg.build_id_size != !!karg.build_id_addr) + return -EINVAL; + + if (!mm || !mmget_not_zero(mm)) + return -ESRCH; + + err = query_vma_setup(&lock_ctx); + if (err) { + mmput(mm); + return err; + } + + vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + vma = NULL; + goto out; + } + + karg.vma_start = vma->vm_start; + karg.vma_end = vma->vm_end; + + karg.vma_flags = 0; + if (vma->vm_flags & VM_READ) + karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; + if (vma->vm_flags & VM_WRITE) + karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; + if (vma->vm_flags & VM_EXEC) + karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; + if (vma->vm_flags & VM_MAYSHARE) + karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; + + karg.vma_page_size = vma_kernel_pagesize(vma); + + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + + karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; + karg.dev_major = MAJOR(inode->i_sb->s_dev); + karg.dev_minor = MINOR(inode->i_sb->s_dev); + karg.inode = inode->i_ino; + } else { + karg.vma_offset = 0; + karg.dev_major = 0; + karg.dev_minor = 0; + karg.inode = 0; + } + + if (karg.build_id_size) { + __u32 build_id_sz; + + err = build_id_parse(vma, build_id_buf, &build_id_sz); + if (err) { + karg.build_id_size = 0; + } else { + if (karg.build_id_size < build_id_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.build_id_size = build_id_sz; + } + } + + if (karg.vma_name_size) { + size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); + const struct path *path; + const char *name_fmt; + size_t name_sz = 0; + + get_vma_name(vma, &path, &name, &name_fmt); + + if (path || name_fmt || name) { + name_buf = kmalloc(name_buf_sz, GFP_KERNEL); + if (!name_buf) { + err = -ENOMEM; + goto out; + } + } + if (path) { + name = d_path(path, name_buf, name_buf_sz); + if (IS_ERR(name)) { + err = PTR_ERR(name); + goto out; + } + name_sz = name_buf + name_buf_sz - name; + } else if (name || 
name_fmt) { + name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); + name = name_buf; + } + if (name_sz > name_buf_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.vma_name_size = name_sz; + } + + /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ + query_vma_teardown(&lock_ctx); + mmput(mm); + + if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), + name, karg.vma_name_size)) { + kfree(name_buf); + return -EFAULT; + } + kfree(name_buf); + + if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), + build_id_buf, karg.build_id_size)) + return -EFAULT; + + if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) + return -EFAULT; + + return 0; + +out: + query_vma_teardown(&lock_ctx); + mmput(mm); + kfree(name_buf); + return err; +} + +static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + switch (cmd) { + case PROCMAP_QUERY: + /* priv->lock_ctx.mm is set during file open operation */ + return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, + .unlocked_ioctl = procfs_procmap_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -412,6 +860,7 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; @@ -422,14 +871,14 @@ struct mem_size_stats { }; static void smaps_page_accumulate(struct mem_size_stats *mss, - struct page *page, unsigned long size, unsigned long pss, + struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { mss->pss += pss; - if (PageAnon(page)) + if (folio_test_anon(folio)) mss->pss_anon += pss; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; @@ -437,7 +886,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, if (locked) mss->pss_locked += pss; - if (dirty || PageDirty(page)) { + if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; @@ -453,53 +902,76 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, - bool migration) + bool present) { + struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; + bool exclusive; + int mapcount; /* * First accumulate quantities that depend only on |size| and the type * of the compound page. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { mss->anonymous += size; - if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + if (!folio_test_swapbacked(folio) && !dirty && + !folio_test_dirty(folio)) mss->lazyfree += size; } + if (folio_test_ksm(folio)) + mss->ksm += size; + mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || page_is_young(page) || PageReferenced(page)) + if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. 
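 *
 * (Editor's aside, not part of this patch: as a worked example of the
 * proportional accounting below, a 4 KiB page mapped by three processes
 * contributes 4 KiB to each process's Rss but only 4096 / 3, roughly 1365
 * bytes, to each Pss, so the per-process Pss values sum to the memory
 * actually in use.)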
* - * page_count(page) == 1 guarantees the page is mapped exactly once. - * If any subpage of the compound page mapped with PTE it would elevate - * page_count(). + * refcount == 1 for present entries guarantees that the folio is mapped + * exactly once. For large folios this implies that exactly one + * PTE/PMD/... maps (a part of) this folio. * - * The page_mapcount() is called to get a snapshot of the mapcount. - * Without holding the page lock this snapshot can be slightly wrong as - * we cannot always read the mapcount atomically. It is not safe to - * call page_mapcount() even with PTL held if the page is not mapped, - * especially for migration entries. Treat regular migration entries - * as mapcount == 1. + * Treat all non-present entries (where relying on the mapcount and + * refcount doesn't make sense) as "maybe shared, but not sure how + * often". We treat device private entries as being fake-present. + * + * Note that it would not be safe to read the mapcount especially for + * pages referenced by migration entries, even with the PTL held. */ - if ((page_count(page) == 1) || migration) { - smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, - locked, true); + if (folio_ref_count(folio) == 1 || !present) { + smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, + dirty, locked, present); return; } + + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + mapcount = folio_average_page_mapcount(folio); + exclusive = !folio_maybe_mapped_shared(folio); + } + + /* + * We obtain a snapshot of the mapcount. Without holding the folio lock + * this snapshot can be slightly wrong as we cannot always read the + * mapcount atomically. + */ for (i = 0; i < nr; i++, page++) { - int mapcount = page_mapcount(page); unsigned long pss = PAGE_SIZE << PSS_SHIFT; + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + mapcount = folio_precise_page_mapcount(folio, page); + exclusive = mapcount < 2; + } + if (mapcount >= 2) pss /= mapcount; - smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, - mapcount < 2); + smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, + dirty, locked, exclusive); } } @@ -537,20 +1009,24 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false, young = false, dirty = false; + bool present = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + present = true; + } else if (pte_none(ptent)) { + smaps_pte_hole_lookup(addr, walk); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (!non_swap_entry(swpent)) { + if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; - mapcount = swp_swapcount(swpent); + mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; @@ -559,20 +1035,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) { - if (is_migration_entry(swpent)) - migration = true; - page = pfn_swap_entry_to_page(swpent); + } else if (softleaf_has_pfn(entry)) { + if (softleaf_is_device_private(entry)) + 
present = true; + page = softleaf_to_page(entry); } - } else { - smaps_pte_hole_lookup(addr, walk); - return; } if (!page) return; - smaps_account(mss, page, false, young, dirty, locked, migration); + smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -583,32 +1056,34 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false; + bool present = false; + struct folio *folio; + if (pmd_none(*pmd)) + return; if (pmd_present(*pmd)) { - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); - } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - - if (is_migration_entry(entry)) { - migration = true; - page = pfn_swap_entry_to_page(entry); - } + page = vm_normal_page_pmd(vma, addr, *pmd); + present = true; + } else if (unlikely(thp_migration_supported())) { + const softleaf_t entry = softleaf_from_pmd(*pmd); + + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; - if (PageAnon(page)) + folio = page_folio(page); + if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; - else if (is_zone_device_page(page)) + else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), - locked, migration); + locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -631,14 +1106,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, goto out; } - if (pmd_trans_unstable(pmd)) - goto out; - /* - * The mmap_lock held all the way back in m_start() is what - * keeps khugepaged out of here and from collapsing things - * in here. - */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); @@ -651,8 +1123,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. + * + * The length of the second argument of mnemonics[] + * needs to be 3 instead of previously set 2 + * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) + * to avoid spurious + * -Werror=unterminated-string-initialization warning + * with GCC 15 */ - static const char mnemonics[BITS_PER_LONG][2] = { + static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. 
*/ @@ -668,6 +1147,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -703,14 +1183,25 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", -#if VM_PKEY_BIT4 +#endif +#if CONFIG_ARCH_PKEY_BITS > 4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK + [ilog2(VM_SHADOW_STACK)] = "ss", +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) + [ilog2(VM_DROPPABLE)] = "dp", +#endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", +#endif }; size_t i; @@ -718,11 +1209,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { - seq_putc(m, mnemonics[i][0]); - seq_putc(m, mnemonics[i][1]); - seq_putc(m, ' '); - } + if (vma->vm_flags & (1UL << i)) + seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } @@ -734,24 +1222,32 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - struct page *page = NULL; + struct folio *folio = NULL; + bool present = false; + spinlock_t *ptl; + pte_t ptent; - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); + ptent = huge_ptep_get(walk->mm, addr, pte); + if (pte_present(ptent)) { + folio = page_folio(pte_page(ptent)); + present = true; + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (is_pfn_swap_entry(swpent)) - page = pfn_swap_entry_to_page(swpent); + if (softleaf_has_pfn(entry)) + folio = softleaf_to_folio(entry); } - if (page) { - int mapcount = page_mapcount(page); - if (mapcount >= 2) + if (folio) { + /* We treat non-present entries as "maybe shared". 
*/ + if (!present || folio_maybe_mapped_shared(folio) || + hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } + spin_unlock(ptl); return 0; } #else @@ -761,12 +1257,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -784,7 +1282,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (start >= vma->vm_end) return; -#ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -805,7 +1302,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, ops = &smaps_shmem_walk_ops; } } -#endif + /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); @@ -841,6 +1338,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); @@ -859,9 +1357,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; - struct mem_size_stats mss; - - memset(&mss, 0, sizeof(mss)); + struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); @@ -874,8 +1370,9 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + seq_printf(m, "THPeligible: %8u\n", + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -887,12 +1384,12 @@ static int show_smap(struct seq_file *m, void *v) static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mem_size_stats mss; - struct mm_struct *mm = priv->mm; + struct mem_size_stats mss = {}; + struct mm_struct *mm = priv->lock_ctx.mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -903,14 +1400,12 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_task; } - memset(&mss, 0, sizeof(mss)); - ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -925,7 +1420,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. 
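[Editor's note] The aggregate that __show_smap() emits, including the newly added KSM: line, is consumed simply by reading the rollup file. A rough userspace sketch pulling the Pss: and KSM: totals (field names taken from __show_smap() above; the parsing is deliberately simplistic):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		unsigned long long pss = 0, ksm = 0;
		FILE *f = fopen("/proc/self/smaps_rollup", "r");

		if (!f)
			return 1;

		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "Pss:", 4))
				sscanf(line + 4, "%llu", &pss);
			else if (!strncmp(line, "KSM:", 4))
				sscanf(line + 4, "%llu", &ksm);
		}
		fclose(f);
		printf("Pss: %llu kB, KSM: %llu kB\n", pss, ksm);
		return 0;
	}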
*/ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -950,40 +1445,44 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ - if (vma->vm_start >= last_vma_end) + /* Case 1 and 2 above */ + if (vma->vm_start >= last_vma_end) { + smap_gather_stats(vma, &mss, 0); + last_vma_end = vma->vm_end; continue; + } /* Case 4 above */ - if (vma->vm_end > last_vma_end) + if (vma->vm_end > last_vma_end) { smap_gather_stats(vma, &mss, last_vma_end); + last_vma_end = vma->vm_end; + } } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1031,9 +1530,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) goto out_free; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - ret = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -1051,8 +1550,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); kfree(priv); return single_release(inode, file); @@ -1085,34 +1584,37 @@ struct clear_refs_private { enum clear_refs_types type; }; -#ifdef CONFIG_MEM_SOFT_DIRTY - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - struct page *page; + struct folio *folio; if (!pte_write(pte)) return false; if (!is_cow_mapping(vma->vm_flags)) return false; - if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) + if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))) return false; - page = vm_normal_page(vma, addr, pte); - if (!page) + folio = vm_normal_folio(vma, addr, pte); + if (!folio) return false; - return page_maybe_dma_pinned(page); + return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + if (!pgtable_supports_soft_dirty()) + return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. 
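[Editor's note] The soft-dirty machinery around clear_soft_dirty() is driven from userspace by writing 4 to /proc/<pid>/clear_refs and later testing bit 55 (PM_SOFT_DIRTY) of the corresponding pagemap entry. A minimal sketch of that round trip for one page of the caller's own address space, assuming 4 KiB pages and a kernel with soft-dirty support enabled:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define PM_SOFT_DIRTY	(1ULL << 55)	/* matches the PM_* definitions later in this file */

	static uint64_t pagemap_entry(int fd, unsigned long vaddr)
	{
		uint64_t ent = 0;

		/* One 64-bit entry per 4 KiB page. */
		pread(fd, &ent, sizeof(ent), (vaddr / 4096) * sizeof(ent));
		return ent;
	}

	int main(void)
	{
		static char page[4096] __attribute__((aligned(4096)));
		int clear = open("/proc/self/clear_refs", O_WRONLY);
		int pm = open("/proc/self/pagemap", O_RDONLY);

		if (clear < 0 || pm < 0)
			return 1;

		if (write(clear, "4", 1) != 1)	/* CLEAR_REFS_SOFT_DIRTY: reset soft-dirty for the mm */
			return 1;
		page[0] = 1;			/* dirty the page again via a write fault */
		printf("soft-dirty: %s\n",
		       (pagemap_entry(pm, (unsigned long)page) & PM_SOFT_DIRTY) ?
		       "yes" : "no");
		close(clear);
		close(pm);
		return 0;
	}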
*/ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); + + if (pte_none(ptent)) + return; if (pte_present(ptent)) { pte_t old_pte; @@ -1123,24 +1625,21 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { + } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } -#else -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} -#endif -#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + if (!pgtable_supports_soft_dirty()) + return; + if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); @@ -1153,7 +1652,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } @@ -1172,7 +1671,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -1184,23 +1683,24 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pmd_present(*pmd)) goto out; - page = pmd_page(*pmd); + folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } - if (pmd_trans_unstable(pmd)) - return 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1210,14 +1710,14 @@ out: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio) continue; /* Clear accessed and referenced bits. 
*/ ptep_test_and_clear_young(vma, addr, pte); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -1249,20 +1749,20 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, + .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1279,7 +1779,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1299,16 +1799,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); @@ -1351,6 +1851,7 @@ struct pagemapread { #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) +#define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1362,8 +1863,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1371,6 +1871,13 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, return 0; } +static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) +{ + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + return folio_precise_page_mapcount(folio, page) == 1; + return !folio_maybe_mapped_shared(folio); +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { @@ -1390,7 +1897,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1402,7 +1909,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1416,7 +1923,10 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; 
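[Editor's note] Given the bit assignments above (including the new PM_GUARD_REGION at bit 58), decoding a raw 64-bit entry read from /proc/<pid>/pagemap is mechanical. A small userspace helper mirroring that layout; the bit values are copied from the PM_* macros in this file, and the PFN field is only meaningful when the reader has CAP_SYS_ADMIN:

	#include <stdint.h>
	#include <stdio.h>

	#define PM_PFRAME_MASK		((1ULL << 55) - 1)	/* bits 0-54: PFN or swap info */
	#define PM_SOFT_DIRTY		(1ULL << 55)
	#define PM_MMAP_EXCLUSIVE	(1ULL << 56)
	#define PM_UFFD_WP		(1ULL << 57)
	#define PM_GUARD_REGION		(1ULL << 58)
	#define PM_FILE			(1ULL << 61)
	#define PM_SWAP			(1ULL << 62)
	#define PM_PRESENT		(1ULL << 63)

	static void decode_pagemap_entry(uint64_t e)
	{
		if (e & PM_PRESENT)
			printf("present pfn=%llu%s%s\n",
			       (unsigned long long)(e & PM_PFRAME_MASK),
			       (e & PM_MMAP_EXCLUSIVE) ? " exclusive" : "",
			       (e & PM_FILE) ? " file" : " anon");
		else if (e & PM_SWAP)
			printf("swapped, swap info=%llu\n",
			       (unsigned long long)(e & PM_PFRAME_MASK));
		else if (e & PM_GUARD_REGION)
			printf("guard region\n");
		else
			printf("not present\n");

		if (e & PM_SOFT_DIRTY)
			printf("  soft-dirty\n");
		if (e & PM_UFFD_WP)
			printf("  uffd write-protected\n");
	}

	int main(void)
	{
		decode_pagemap_entry((1ULL << 63) | (1ULL << 56) | 0x1234);
		return 0;
	}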
struct page *page = NULL; - bool migration = false; + struct folio *folio; + + if (pte_none(pte)) + goto out; if (pte_present(pte)) { if (pm->show_pfn) @@ -1427,136 +1937,164 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; - } else if (is_swap_pte(pte)) { - swp_entry_t entry; + } else { + softleaf_t entry; + if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (pm->show_pfn) { pgoff_t offset; + /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; - migration = is_migration_entry(entry); - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); + if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; + if (softleaf_is_guard_marker(entry)) + flags |= PM_GUARD_REGION; + } + + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + if ((flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + flags |= PM_MMAP_EXCLUSIVE; } - if (page && !PageAnon(page)) - flags |= PM_FILE; - if (page && !migration && page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; +out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } -static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, - struct mm_walk *walk) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct vm_area_struct *vma, + struct pagemapread *pm) { - struct vm_area_struct *vma = walk->vma; - struct pagemapread *pm = walk->private; - spinlock_t *ptl; - pte_t *pte, *orig_pte; + unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + struct folio *folio = NULL; int err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool migration = false; - ptl = pmd_trans_huge_lock(pmdp, vma); - if (ptl) { - u64 flags = 0, frame = 0; - pmd_t pmd = *pmdp; - struct page *page = NULL; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; - if (vma->vm_flags & VM_SOFTDIRTY) + if (pmd_none(pmd)) + goto populate_pagemap; + + if (pmd_present(pmd)) { + page = pmd_page(pmd); + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + idx; + } else if (thp_migration_supported()) { + const softleaf_t entry = softleaf_from_pmd(pmd); + unsigned long offset; - if (pmd_present(pmd)) { - page = pmd_page(pmd); - - flags |= PM_PRESENT; - if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - if (pm->show_pfn) - frame = pmd_pfn(pmd) + - ((addr & ~PMD_MASK) >> PAGE_SHIFT); - } -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - else if (is_swap_pmd(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset; - - if (pm->show_pfn) { - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); - else - offset = swp_offset(entry); - offset = 
offset + - ((addr & ~PMD_MASK) >> PAGE_SHIFT); - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); - } - flags |= PM_SWAP; - if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_swp_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - migration = is_migration_entry(entry); - page = pfn_swap_entry_to_page(entry); + if (pm->show_pfn) { + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry) + idx; + else + offset = swp_offset(entry) + idx; + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); } -#endif + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); + page = softleaf_to_page(entry); + } - if (page && !migration && page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + } - for (; addr != end; addr += PAGE_SIZE) { - pagemap_entry_t pme = make_pme(frame, flags); +populate_pagemap: + for (; addr != end; addr += PAGE_SIZE, idx++) { + u64 cur_flags = flags; + pagemap_entry_t pme; - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; - if (pm->show_pfn) { - if (flags & PM_PRESENT) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); - } + if (folio && (flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + cur_flags |= PM_MMAP_EXCLUSIVE; + + pme = make_pme(frame, cur_flags); + err = add_to_pagemap(&pme, pm); + if (err) + break; + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } + } + return err; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct pagemapread *pm = walk->private; + spinlock_t *ptl; + pte_t *pte, *orig_pte; + int err = 0; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { + err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm); spin_unlock(ptl); return err; } - - if (pmd_trans_unstable(pmdp)) - return 0; -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. 
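[Editor's note] As the swap branches above show, for a swapped page the low 55 bits of a pagemap entry carry swp_type() in the lowest MAX_SWAPFILES_SHIFT (5) bits and the swap offset (or PFN, for migration and similar entries) above them. A tiny sketch of the userspace-side split, with a made-up example entry:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_SWAPFILES_SHIFT	5
	#define PM_PFRAME_MASK		((1ULL << 55) - 1)
	#define PM_SWAP			(1ULL << 62)

	int main(void)
	{
		/* Example entry: swap type 1, offset 0x2a, marked as swapped. */
		uint64_t entry = PM_SWAP | (1ULL | (0x2aULL << MAX_SWAPFILES_SHIFT));

		if (entry & PM_SWAP) {
			uint64_t frame = entry & PM_PFRAME_MASK;

			printf("swap type %llu, offset %llu\n",
			       (unsigned long long)(frame & ((1 << MAX_SWAPFILES_SHIFT) - 1)),
			       (unsigned long long)(frame >> MAX_SWAPFILES_SHIFT));
		}
		return 0;
	}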
*/ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return err; + } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); - err = add_to_pagemap(addr, &pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1576,20 +2114,23 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; + spinlock_t *ptl; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; - pte = huge_ptep_get(ptep); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - if (!PageAnon(page)) + if (!folio_test_anon(folio)) flags |= PM_FILE; - if (page_mapcount(page) == 1) + if (!folio_maybe_mapped_shared(folio) && + !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) @@ -1606,13 +2147,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) - return err; + break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } + spin_unlock(ptl); cond_resched(); return err; @@ -1625,6 +2167,7 @@ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1639,7 +2182,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected - * Bits 58-60 zero + * Bit 58 pte is a guard region + * Bits 59-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1692,19 +2236,24 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; - if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) - start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + unsigned long end; + + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); + mmap_read_unlock(mm); + + end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); + if (end >= start_vaddr && end < mm->task_size) + end_vaddr = end; + } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; @@ -1748,8 +2297,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? 
PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } @@ -1763,11 +2312,794 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories; + + if (pte_none(pte)) + return 0; + + if (pte_present(pte)) { + struct page *page; + + categories = PAGE_IS_PRESENT; + + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + softleaf_t entry; + + categories = PAGE_IS_SWAPPED; + + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + entry = softleaf_from_pte(pte); + if (softleaf_is_guard_marker(entry)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) + categories |= PAGE_IS_FILE; + + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, pte_t ptent) +{ + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(old_pte); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (pte_none(ptent)) { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } else { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_none(pmd)) + return categories; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_huge_zero_pmd(pmd)) + categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; + + if (p->masks_of_interest & PAGE_IS_FILE) { + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if 
(pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (pmd_is_migration_entry(pmd)) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pte_none(pte)) + return categories; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + categories |= PAGE_IS_SWAPPED; + + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + const unsigned long psize = huge_page_size(hstate_vma(vma)); + softleaf_t entry; + + if (huge_pte_none(ptent)) { + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); + return; + } + + entry = softleaf_from_pte(ptent); + if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) + return; + + if (softleaf_is_migration(entry)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (!p->vec_buf) + return; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + bool wp_allowed = userfaultfd_wp_async(vma) && + userfaultfd_wp_use_markers(vma); + + if (!wp_allowed) { + /* User requested explicit failure over wp-async capability */ + if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + /* + * User requires wr-protect, and allows silently skipping + * unsupported vmas. 
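[Editor's note] pagemap_scan_is_interesting_page() above implements the PAGEMAP_SCAN filter: the page's categories are first XORed with category_inverted, must then contain all of category_mask, and must hit at least one bit of category_anyof_mask when that mask is non-zero. A self-contained illustration of the predicate; the PAGE_IS_* values are defined locally for the example, the real ones live in <linux/fs.h>:

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_IS_WRITTEN		(1UL << 1)
	#define PAGE_IS_FILE		(1UL << 2)
	#define PAGE_IS_PRESENT		(1UL << 3)

	/* Mirrors the masking logic of pagemap_scan_is_interesting_page(). */
	static bool interesting(unsigned long categories, unsigned long inverted,
				unsigned long mask, unsigned long anyof)
	{
		categories ^= inverted;
		if ((categories & mask) != mask)
			return false;
		if (anyof && !(categories & anyof))
			return false;
		return true;
	}

	int main(void)
	{
		/* "Report pages that are present and NOT written", via category_inverted. */
		unsigned long cats = PAGE_IS_PRESENT | PAGE_IS_FILE;	/* page state */
		unsigned long want = PAGE_IS_PRESENT | PAGE_IS_WRITTEN;	/* category_mask */
		unsigned long inv  = PAGE_IS_WRITTEN;			/* category_inverted */

		printf("%s\n", interesting(cats, inv, want, 0) ? "match" : "no match");
		return 0;
	}

Here a present page that has not been written matches, because inverting PAGE_IS_WRITTEN turns "not written" into a bit the mask can require.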
+ */ + if (p->arg.flags & PM_SCAN_WP_MATCHING) + return 1; + /* + * Then the request doesn't involve wr-protects at all, + * fall through to the rest checks, and allow vma walk. + */ + } + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (wp_allowed) + vma_category |= PAGE_IS_WPALLOWED; + + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. 
+ */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) + return ret; + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } + + arch_enter_lazy_mmu_mode(); + + if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { + /* Fast path for performing exclusive WP */ + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(pte); + + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) + continue; + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = addr + PAGE_SIZE; + } + goto flush_and_return; + } + + if (!p->arg.category_anyof_mask && !p->arg.category_inverted && + p->arg.category_mask == PAGE_IS_WRITTEN && + p->arg.return_mask == PAGE_IS_WRITTEN) { + for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { + unsigned long next = addr + PAGE_SIZE; + pte_t ptent = ptep_get(pte); + + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) + continue; + ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, + p, addr, &next); + if (next == addr) + break; + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = next; + } + goto flush_and_return; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(pte); + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptent); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = next; + } + +flush_and_return: + if (flush_end) + flush_tlb_range(vma, start, addr); + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. 
*/ + + pte = huge_ptep_get(walk->mm, start, ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(walk->mm, start, ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + size_mul(arg->vec_len, sizeof(struct page_region)))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 
0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + struct mmu_notifier_range range; + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, walk_start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. 
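[Editor's note] Seen from the caller's side of do_pagemap_scan(): userspace fills a struct pm_scan_arg, issues PAGEMAP_SCAN on a pagemap file descriptor, and gets back the number of struct page_region entries written plus walk_end for resuming. A hedged sketch, assuming the structure and ioctl names exported through <linux/fs.h> on a kernel that has this interface:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/fs.h>	/* struct pm_scan_arg, struct page_region, PAGEMAP_SCAN */

	int main(void)
	{
		struct page_region vec[64];
		struct pm_scan_arg arg;
		size_t len = 16 * 4096;
		char *buf;
		long n;
		int fd = open("/proc/self/pagemap", O_RDONLY);

		if (fd < 0)
			return 1;

		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;
		buf[0] = 1;				/* fault in the first page only */

		memset(&arg, 0, sizeof(arg));
		arg.size = sizeof(arg);
		arg.start = (unsigned long)buf;
		arg.end = (unsigned long)buf + len;
		arg.vec = (unsigned long)vec;
		arg.vec_len = 64;
		arg.category_mask = PAGE_IS_PRESENT;	/* report only present pages */
		arg.return_mask = PAGE_IS_PRESENT;

		n = ioctl(fd, PAGEMAP_SCAN, &arg);	/* >= 0: number of page_region entries */
		for (long i = 0; i < n; i++)
			printf("%llx-%llx categories=%llx\n",
			       (unsigned long long)vec[i].start,
			       (unsigned long long)vec[i].end,
			       (unsigned long long)vec[i].categories);

		munmap(buf, len);
		close(fd);
		return 0;
	}

With only the first page faulted in, the scan should report a single region covering that page; a larger buffer than vec_len can hold ends early and walk_end tells the caller where to restart.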
*/ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ @@ -1792,28 +3124,34 @@ struct numa_maps_private { static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { - int count = page_mapcount(page); + struct folio *folio = page_folio(page); + int count; + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + count = folio_precise_page_mapcount(folio, page); + else + count = folio_average_page_mapcount(folio); md->pages += nr_pages; - if (pte_dirty || PageDirty(page)) + if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) md->swapcache += nr_pages; - if (PageActive(page) || PageUnevictable(page)) + if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) md->writeback += nr_pages; - if (PageAnon(page)) + if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)] += nr_pages; + md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, @@ -1886,16 +3224,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); return 0; } - - if (pmd_trans_unstable(pmd)) - return 0; #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -1906,17 +3246,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pte_t huge_pte = huge_ptep_get(pte); + pte_t huge_pte; struct numa_maps *md; struct page *page; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + huge_pte = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(huge_pte)) - return 0; + goto out; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); +out: + spin_unlock(ptl); return 0; } @@ -1931,6 +3276,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1944,8 +3290,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = 
&numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -1954,7 +3301,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); @@ -1966,10 +3313,10 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_path(m, file_user_path(file), "\n\t= "); + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2fd06f52b6a4..d362919f4f68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -38,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; @@ -51,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) sbytes += kobjsize(mm); else bytes += kobjsize(mm); - + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else @@ -69,13 +69,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) bytes += kobjsize(current); /* includes kernel stack */ + mmap_read_unlock(mm); + seq_printf(m, "Mem:\t%8lu bytes\n" "Slack:\t%8lu bytes\n" "Shared:\t%8lu bytes\n", bytes, slack, sbytes); - - mmap_read_unlock(mm); } unsigned long task_vsize(struct mm_struct *mm) @@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ @@ -170,8 +157,8 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); - } else if (mm && is_stack(vma)) { + seq_path(m, file_user_path(file), ""); + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); } @@ -188,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. 
*/ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -204,45 +204,42 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + mm = priv->lock_ctx.mm; + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - mmap_read_unlock(mm); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->lock_ctx.mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { @@ -262,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + int err = priv->lock_ctx.mm ? 
PTR_ERR(priv->lock_ctx.mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -279,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a553273fbd41..d6113dbe58e0 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum __ro_after_init; +unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; @@ -46,24 +45,20 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; - d_add(thread_self, inode); + d_make_persistent(thread_self, inode); ret = 0; - } else { - dput(thread_self); } + dput(thread_self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); - else - fs_info->proc_thread_self = thread_self; - return ret; } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b1273..f188bd900eb2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,6 +8,8 @@ * */ +#define pr_fmt(fmt) "vmcore: " fmt + #include <linux/mm.h> #include <linux/kcore.h> #include <linux/user.h> @@ -51,9 +53,14 @@ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +struct vmcoredd_node { + struct list_head list; /* List of dumps */ + void *buf; /* Buffer containing device's dump */ + unsigned int size; /* Size of the buffer */ +}; + /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); -static DEFINE_MUTEX(vmcoredd_mutex); static bool vmcoredd_disabled; core_param(novmcoredd, vmcoredd_disabled, bool, 0); @@ -62,17 +69,22 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DEFINE_SPINLOCK(vmcore_cb_lock); +static DEFINE_MUTEX(vmcore_mutex); + DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ static bool vmcore_opened; +/* Whether the vmcore is currently open. 
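[Editor's note] On the provider side, a driver running in the kdump ("second") kernel opts pages out of /proc/vmcore by registering a struct vmcore_cb, via the register_vmcore_cb()/unregister_vmcore_cb() pair shown in the hunks that follow, whose ->pfn_is_ram() hook returns false for PFNs that should read back as zeroes. A minimal, hypothetical module sketch; the dummy_pfn_is_ram() policy is made up, real users such as virtio-mem consult their device state:

	// SPDX-License-Identifier: GPL-2.0
	#include <linux/module.h>
	#include <linux/crash_dump.h>

	static bool dummy_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
	{
		/* Hypothetical policy: claim every PFN is backed by RAM. */
		return true;
	}

	static struct vmcore_cb dummy_vmcore_cb = {
		.pfn_is_ram = dummy_pfn_is_ram,
	};

	static int __init dummy_init(void)
	{
		/* Only meaningful in a kdump kernel where /proc/vmcore exists. */
		register_vmcore_cb(&dummy_vmcore_cb);
		return 0;
	}

	static void __exit dummy_exit(void)
	{
		unregister_vmcore_cb(&dummy_vmcore_cb);
	}

	module_init(dummy_init);
	module_exit(dummy_exit);
	MODULE_LICENSE("GPL");

Note how the new code below only calls a callback's get_device_ram handler while no reader holds the vmcore open, which is why registration and the open/release counters now share vmcore_mutex.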
*/ +static unsigned int vmcore_open; + +static void vmcore_process_device_ram(struct vmcore_cb *cb); void register_vmcore_cb(struct vmcore_cb *cb) { INIT_LIST_HEAD(&cb->next); - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -80,13 +92,15 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - spin_unlock(&vmcore_cb_lock); + if (!vmcore_open && cb->get_device_ram) + vmcore_process_device_ram(cb); + mutex_unlock(&vmcore_mutex); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is @@ -95,7 +109,7 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - spin_unlock(&vmcore_cb_lock); + mutex_unlock(&vmcore_mutex); synchronize_srcu(&vmcore_cb_srcu); } @@ -120,9 +134,23 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - spin_lock(&vmcore_cb_lock); + mutex_lock(&vmcore_mutex); vmcore_opened = true; - spin_unlock(&vmcore_cb_lock); + if (vmcore_open + 1 == 0) { + mutex_unlock(&vmcore_mutex); + return -EBUSY; + } + vmcore_open++; + mutex_unlock(&vmcore_mutex); + + return 0; +} + +static int release_vmcore(struct inode *inode, struct file *file) +{ + mutex_lock(&vmcore_mutex); + vmcore_open--; + mutex_unlock(&vmcore_mutex); return 0; } @@ -132,7 +160,7 @@ ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, u64 *ppos, bool encrypted) { unsigned long pfn, offset; - size_t nr_bytes; + ssize_t nr_bytes; ssize_t read = 0, tmp; int idx; @@ -243,33 +271,27 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to_iter(buf, tsz, iter) < tsz) { - ret = -EFAULT; - goto out_unlock; - } + if (copy_to_iter(buf, tsz, iter) < tsz) + return -EFAULT; size -= tsz; start += tsz; /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcoredd_mutex); - return ret; + return 0; } #ifdef CONFIG_MMU @@ -278,20 +300,16 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, { struct vmcoredd_node *dump; u64 offset = 0; - int ret = 0; size_t tsz; char *buf; - mutex_lock(&vmcoredd_mutex); list_for_each_entry(dump, &vmcoredd_list, list) { if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; if (remap_vmalloc_range_partial(vma, dst, buf, 0, - tsz)) { - ret = -EFAULT; - goto out_unlock; - } + tsz)) + return -EFAULT; size -= tsz; start += tsz; @@ -299,14 +317,12 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, /* Leave now if buffer filled already */ if (!size) - goto out_unlock; + return 0; } offset += dump->size; } -out_unlock: - mutex_unlock(&vmcoredd_mutex); - return ret; + return 0; } #endif /* CONFIG_MMU */ #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ @@ -316,10 +332,10 
@@ out_unlock: */ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { + struct vmcore_range *m = NULL; ssize_t acc = 0, tmp; size_t tsz; u64 start; - struct vmcore *m = NULL; if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; @@ -339,7 +355,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) return acc; } - /* Read Elf note segment */ + /* Read ELF note segment */ if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; @@ -383,6 +399,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) /* leave now if filled buffer already */ if (!iov_iter_count(iter)) return acc; + + cond_resched(); } list_for_each_entry(m, &vmcore_list, list) { @@ -402,6 +420,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) if (!iov_iter_count(iter)) return acc; } + + cond_resched(); } return acc; @@ -412,6 +432,34 @@ static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) return __read_vmcore(iter, &iocb->ki_pos); } +/** + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @size: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *vmcore_alloc_buf(size_t size) +{ +#ifdef CONFIG_MMU + return vmalloc_user(size); +#else + return vzalloc(size); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU + /* * The vmcore fault handler uses the page cache and fills data using the * standard __read_vmcore() function. @@ -459,33 +507,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = { .fault = mmap_vmcore_fault, }; -/** - * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @size: size of buffer - * - * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap - * the buffer to user-space by means of remap_vmalloc_range(). - * - * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is - * disabled and there's no need to allow users to mmap the buffer. - */ -static inline char *vmcore_alloc_buf(size_t size) -{ -#ifdef CONFIG_MMU - return vmalloc_user(size); -#else - return vzalloc(size); -#endif -} - -/* - * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is - * essential for mmap_vmcore() in order to map physically - * non-contiguous objects (ELF header, ELF note segment and memory - * regions in the 1st kernel pointed to by PT_LOAD entries) into - * virtually contiguous user-space in ELF layout. - */ -#ifdef CONFIG_MMU /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. 
@@ -571,7 +592,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; u64 start, end, len, tsz; - struct vmcore *m; + struct vmcore_range *m; start = (u64)vma->vm_pgoff << PAGE_SHIFT; end = start + size; @@ -582,8 +603,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; @@ -689,21 +709,17 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, + .proc_release = release_vmcore, .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; -static struct vmcore* __init get_new_element(void) -{ - return kzalloc(sizeof(struct vmcore), GFP_KERNEL); -} - static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, struct list_head *vc_list) { + struct vmcore_range *m; u64 size; - struct vmcore *m; size = elfsz + elfnotesegsz; list_for_each_entry(m, vc_list, list) { @@ -878,7 +894,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf64_Ehdr); @@ -1069,7 +1085,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf32_Ehdr); @@ -1105,12 +1121,11 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, Elf64_Ehdr *ehdr_ptr; Elf64_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1124,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset. */ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -1148,12 +1158,11 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, Elf32_Ehdr *ehdr_ptr; Elf32_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. 
*/ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1167,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset */ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -1186,10 +1190,10 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, struct list_head *vc_list) { + struct vmcore_range *m; loff_t vmcore_off; - struct vmcore *m; - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { @@ -1214,7 +1218,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -1270,7 +1274,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -1371,18 +1375,17 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); vdd_hdr->n_type = NT_VMCOREDD; - strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, - sizeof(vdd_hdr->name)); - memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); + strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME); + strscpy_pad(vdd_hdr->dump_name, data->dump_name); } /** - * vmcoredd_update_program_headers - Update all Elf program headers + * vmcoredd_update_program_headers - Update all ELF program headers * @elfptr: Pointer to elf header * @elfnotesz: Size of elf notes aligned to page size * @vmcoreddsz: Size of device dumps to be added to elf note header * - * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Determine type of ELF header (Elf64 or Elf32) and update the elf note size. * Also update the offsets of all the program headers after the elf note header. */ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, @@ -1440,10 +1443,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, /** * vmcoredd_update_size - Update the total size of the device dumps and update - * Elf header + * ELF header * @dump_size: Size of the current device dump to be added to total size * - * Update the total size of all the device dumps and update the Elf program + * Update the total size of all the device dumps and update the ELF program * headers. Calculate the new offsets for the vmcore list and update the * total vmcore size. */ @@ -1467,7 +1470,7 @@ static void vmcoredd_update_size(size_t dump_size) * @data: dump info. * * Allocate a buffer and invoke the calling driver's dump collect routine. - * Write Elf note at the beginning of the buffer to indicate vmcore device + * Write ELF note at the beginning of the buffer to indicate vmcore device * dump and add the dump to global list. 
*/ int vmcore_add_device_dump(struct vmcoredd_data *data) @@ -1487,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return -EINVAL; dump = vzalloc(sizeof(*dump)); - if (!dump) { - ret = -ENOMEM; - goto out_err; - } + if (!dump) + return -ENOMEM; /* Keep size of the buffer page aligned so that it can be mmaped */ data_size = roundup(sizeof(struct vmcoredd_header) + data->size, @@ -1515,12 +1516,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) dump->buf = buf; dump->size = data_size; - /* Add the dump to driver sysfs list */ - mutex_lock(&vmcoredd_mutex); - list_add_tail(&dump->list, &vmcoredd_list); - mutex_unlock(&vmcoredd_mutex); + /* Add the dump to driver sysfs list and update the elfcore hdr */ + scoped_guard(mutex, &vmcore_mutex) { + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } - vmcoredd_update_size(data_size); + list_add_tail(&dump->list, &vmcoredd_list); + vmcoredd_update_size(data_size); + } return 0; out_err: @@ -1532,11 +1539,163 @@ out_err: EXPORT_SYMBOL(vmcore_add_device_dump); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size) +{ + char *elfcorebuf_new; + + if (WARN_ON_ONCE(new_size < elfcorebuf_sz)) + return -EINVAL; + if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) { + elfcorebuf_sz_orig = new_size; + return 0; + } + + elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(new_size)); + if (!elfcorebuf_new) + return -ENOMEM; + memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz); + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = elfcorebuf_new; + elfcorebuf_sz_orig = new_size; + return 0; +} + +static void vmcore_reset_offsets_elf64(void) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz; + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + Elf64_Phdr *phdr; + int i; + + for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) { + u64 start, end; + + /* + * After merge_note_headers_elf64() we should only have a single + * PT_NOTE entry that starts immediately after elfcorebuf_sz. + */ + if (phdr->p_type == PT_NOTE) { + phdr->p_offset = elfcorebuf_sz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE); + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off = vmcore_off + end - start; + } + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); +} + +static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + struct vmcore_range *cur; + Elf64_Phdr *phdr; + size_t new_size; + int rc; + + if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) { + pr_err("too many device ram ranges\n"); + return -ENOSPC; + } + + /* elfcorebuf_sz must always cover full pages. */ + new_size = sizeof(Elf64_Ehdr) + + (ehdr->e_phnum + count) * sizeof(Elf64_Phdr); + new_size = roundup(new_size, PAGE_SIZE); + + /* + * Make sure we have sufficient space to include the new PT_LOAD + * entries. + */ + rc = vmcore_realloc_elfcore_buffer_elf64(new_size); + if (rc) { + pr_err("resizing elfcore failed\n"); + return rc; + } + + /* Modify our used elfcore buffer size to cover the new entries. 
*/ + elfcorebuf_sz = new_size; + + /* Fill the added PT_LOAD entries. */ + phdr = phdr_start + ehdr->e_phnum; + list_for_each_entry(cur, list, list) { + WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE)); + elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size); + + /* p_offset will be adjusted later. */ + phdr++; + ehdr->e_phnum++; + } + list_splice_tail(list, &vmcore_list); + + /* We changed elfcorebuf_sz and added new entries; reset all offsets. */ + vmcore_reset_offsets_elf64(); + + /* Finally, recalculate the total vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; + return 0; +} + +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ + unsigned char *e_ident = (unsigned char *)elfcorebuf; + struct vmcore_range *first, *m; + LIST_HEAD(list); + int count; + + /* We only support Elf64 dumps for now. */ + if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) { + pr_err("device ram ranges only support Elf64\n"); + return; + } + + if (cb->get_device_ram(cb, &list)) { + pr_err("obtaining device ram ranges failed\n"); + return; + } + count = list_count_nodes(&list); + if (!count) + return; + + /* + * For some reason these ranges are already known? Might happen + * with unusual register->unregister->register sequences; we'll simply + * sanity check using the first range. + */ + first = list_first_entry(&list, struct vmcore_range, list); + list_for_each_entry(m, &vmcore_list, list) { + unsigned long long m_end = m->paddr + m->size; + unsigned long long first_end = first->paddr + first->size; + + if (first->paddr < m_end && m->paddr < first_end) + goto out_free; + } + + /* If adding the mem nodes succeeds, they must not be freed. */ + if (!vmcore_add_device_ram_elf64(&list, count)) + return; +out_free: + vmcore_free_ranges(&list); +} +#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */ +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ + /* Free all dumps in vmcore device dump list */ static void vmcore_free_device_dumps(void) { #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP - mutex_lock(&vmcoredd_mutex); + mutex_lock(&vmcore_mutex); while (!list_empty(&vmcoredd_list)) { struct vmcoredd_node *dump; @@ -1546,7 +1705,7 @@ static void vmcore_free_device_dumps(void) vfree(dump->buf); vfree(dump); } - mutex_unlock(&vmcoredd_mutex); + mutex_unlock(&vmcore_mutex); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ } @@ -1568,7 +1727,7 @@ static int __init vmcore_init(void) rc = parse_crash_elf_headers(); if (rc) { elfcorehdr_free(elfcorehdr_addr); - pr_warn("Kdump: vmcore not initialized\n"); + pr_warn("not initialized\n"); return rc; } elfcorehdr_free(elfcorehdr_addr); @@ -1589,14 +1748,7 @@ void vmcore_cleanup(void) proc_vmcore = NULL; } - /* clear the vmcore list. */ - while (!list_empty(&vmcore_list)) { - struct vmcore *m; - - m = list_first_entry(&vmcore_list, struct vmcore, list); - list_del(&m->list); - kfree(m); - } + vmcore_free_ranges(&vmcore_list); free_elfcorebuf(); /* clear vmcore device dump list */
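
For illustration only: a minimal driver-side sketch of the new device-RAM path added above, assuming the callback signature used in this diff (cb->get_device_ram(cb, &list)) and the vmcore_alloc_add_range() helper the diff now calls in place of the removed get_new_element() (the helper presumably allocates a struct vmcore_range and appends it to the given list). The driver name, address and size below are made-up placeholders, not part of this patch.

#include <linux/crash_dump.h>
#include <linux/sizes.h>

/*
 * Hypothetical callback: report the page-aligned RAM range(s) this device
 * manages so the kdump ("2nd") kernel can add them as PT_LOAD entries.
 * A real driver would walk its own state instead of this fixed range.
 */
static int foo_vmcore_get_device_ram(struct vmcore_cb *cb,
				     struct list_head *list)
{
	return vmcore_alloc_add_range(list, 0x100000000ULL, SZ_128M);
}

static struct vmcore_cb foo_vmcore_cb = {
	.get_device_ram = foo_vmcore_get_device_ram,
};

static void foo_device_probed(void)
{
	/*
	 * Per the hunk above, registering a callback that provides
	 * ->get_device_ram immediately triggers vmcore_process_device_ram(),
	 * unless /proc/vmcore is currently open (vmcore_open != 0).
	 */
	register_vmcore_cb(&foo_vmcore_cb);
}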

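Similarly, a hedged sketch of a driver using the device-dump interface whose locking this diff reworks (vmcoredd_mutex folded into vmcore_mutex, new -EBUSY rejection while /proc/vmcore is open). Field names follow struct vmcoredd_data as declared in include/linux/crash_dump.h; the driver name, dump size and collection logic are assumptions for illustration.

#include <linux/crash_dump.h>
#include <linux/string.h>

#define FOO_DUMP_SIZE	(64 * 1024)	/* made-up size of the device dump */

/* Invoked by vmcore_add_device_dump() to fill the buffer it allocated. */
static int foo_vmcoredd_collect(struct vmcoredd_data *data, void *buf)
{
	memset(buf, 0, data->size);	/* placeholder for real device state */
	return 0;
}

static int foo_add_device_dump(void)
{
	struct vmcoredd_data data = {};

	strscpy(data.dump_name, "foo", sizeof(data.dump_name));
	data.size = FOO_DUMP_SIZE;
	data.vmcoredd_callback = foo_vmcoredd_collect;

	/*
	 * With this patch, the dump is rejected with -EBUSY if /proc/vmcore
	 * is currently open, instead of the file growing under a reader.
	 */
	return vmcore_add_device_dump(&data);
}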