Diffstat (limited to 'fs/proc')
36 files changed, 7552 insertions, 3747 deletions
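The array.c hunks below add a number of new fields to /proc/<pid>/status (Kthread, Seccomp_filters, Speculation_Store_Bypass, SpeculationIndirectBranch, CoreDumping, THP_enabled, untag_mask). A minimal userspace sketch for spotting them, assuming a kernel carrying these patches; the field names are taken from the hunks, the program itself is illustrative only:

/*
 * Illustrative only: print the /proc/self/status fields introduced by
 * the fs/proc/array.c hunks below. Fields absent on older kernels
 * simply never match, so the sketch degrades gracefully.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char *fields[] = {
		"Kthread:", "Seccomp_filters:", "CoreDumping:",
		"THP_enabled:", "untag_mask:",
		"Speculation_Store_Bypass:", "SpeculationIndirectBranch:",
	};
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
			if (!strncmp(line, fields[i], strlen(fields[i])))
				fputs(line, stdout);
	}
	fclose(f);
	return 0;
}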
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..6ae966c561e7 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config PROC_FS
 	bool "/proc file system support" if EXPERT
 	default y
@@ -22,7 +23,7 @@ config PROC_FS
 	  /proc" or the equivalent line in /etc/fstab does the job.
 
 	  The /proc file system is explained in the file
-	  <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+	  <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage
 	  ("man 5 proc").
 
 	  This option will enlarge your kernel by about 67 KB. Several
@@ -31,6 +32,7 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
+	select VMCORE_INFO
 	help
 	  Provides a virtual ELF core file of the live kernel.  This can
 	  be read with gdb and other ELF tools.  No modifications can be
@@ -40,22 +42,57 @@ config PROC_VMCORE
 	bool "/proc/vmcore support"
 	depends on PROC_FS && CRASH_DUMP
 	default y
-        help
-        Exports the dump image of crashed kernel in ELF format.
+	help
+	  Exports the dump image of crashed kernel in ELF format.
+
+config PROC_VMCORE_DEVICE_DUMP
+	bool "Device Hardware/Firmware Log Collection"
+	depends on PROC_VMCORE
+	default n
+	help
+	  After kernel panic, device drivers can collect the device
+	  specific snapshot of their hardware or firmware before the
+	  underlying devices are initialized in crash recovery kernel.
+	  Note that the device driver must be present in the crash
+	  recovery kernel's initramfs to collect its underlying device
+	  snapshot.
+
+	  If you say Y here, the collected device dumps will be added
+	  as ELF notes to /proc/vmcore. You can still disable device
+	  dump using the kernel command line option 'novmcoredd'.
+
+config NEED_PROC_VMCORE_DEVICE_RAM
+	bool
+
+config PROC_VMCORE_DEVICE_RAM
+	def_bool y
+	depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM
+	depends on VIRTIO_MEM
+	help
+	  If the elfcore hdr is allocated and prepared by the dump kernel
+	  ("2nd kernel") instead of the crashed kernel, RAM provided by memory
+	  devices such as virtio-mem will not be included in the dump
+	  image, because only the device driver can properly detect them.
+
+	  With this config enabled, these RAM ranges will be queried from the
+	  device drivers once the device gets probed, so they can be included
+	  in the crash dump.
+
+	  Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM.
 
 config PROC_SYSCTL
 	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
 	select SYSCTL
 	default y
-	---help---
+	help
 	  The sysctl interface provides a means of dynamically changing
 	  certain kernel parameters and variables on the fly without requiring
 	  a recompile of the kernel or reboot of the system.  The primary
 	  interface is through /proc/sys.  If you say Y here a tree of
 	  modifiable sysctl entries will be generated beneath the
-	  /proc/sys directory. They are explained in the files
-	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  /proc/sys directory. They are explained in the files
+	  in <file:Documentation/admin-guide/sysctl/>.  Note that enabling this
	  option will enlarge the kernel by at least 8 KB.
 
 	  As it is generally a good thing, you should say Y here unless
@@ -70,14 +107,23 @@ config PROC_PAGE_MONITOR
 	  Various /proc files exist to monitor process memory utilization:
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
 	  /proc/kpagecount, and /proc/kpageflags. Disabling these
-	  interfaces will reduce the size of the kernel by approximately 4kb.
+	  interfaces will reduce the size of the kernel by approximately 4kb.
 
 config PROC_CHILDREN
 	bool "Include /proc/<pid>/task/<tid>/children file"
+	depends on PROC_FS
 	default n
 	help
 	  Provides a fast way to retrieve first level children pids of a task. See
-	  <file:Documentation/filesystems/proc.txt> for more information.
+	  <file:Documentation/filesystems/proc.rst> for more information.
 
 	  Say Y if you are running any user-space software which takes benefit from
 	  this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+	def_bool n
+	depends on PROC_FS
+
+config PROC_CPU_RESCTRL
+	def_bool n
+	depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 12c6922c913c..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for the Linux proc filesystem routines.
 #
 
 obj-y   += proc.o
 
-CFLAGS_task_mmu.o	+= $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o	+= -Wno-override-init
 proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= task_mmu.o
 
@@ -20,6 +21,7 @@ proc-y	+= loadavg.o
 proc-y	+= meminfo.o
 proc-y	+= stat.o
 proc-y	+= uptime.o
+proc-y	+= util.o
 proc-y	+= version.o
 proc-y	+= softirqs.o
 proc-y	+= namespaces.o
@@ -31,3 +33,4 @@ proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
+proc-$(CONFIG_BOOT_CONFIG)	+= bootconfig.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c355574aa0..42932f88141a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  linux/fs/proc/array.c
  *
@@ -55,6 +56,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/time.h>
+#include <linux/time_namespace.h>
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
 #include <linux/tty.h>
@@ -62,11 +64,11 @@
 #include <linux/mman.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
+#include <linux/sched/task_stack.h>
 #include <linux/sched/task.h>
 #include <linux/sched/cputime.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
@@ -83,32 +85,36 @@
 #include <linux/delayacct.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
+#include <linux/prctl.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/string_helpers.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
 
-#include <asm/pgtable.h>
 #include <asm/processor.h>
 #include "internal.h"
 
-static inline void task_name(struct seq_file *m, struct task_struct *p)
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
 {
-	char *buf;
-	size_t size;
-	char tcomm[sizeof(p->comm)];
-	int ret;
+	char tcomm[64];
 
-	get_task_comm(tcomm, p);
-
-	seq_puts(m, "Name:\t");
-
-	size = seq_get_buf(m, &buf);
-	ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
-	seq_commit(m, ret < size ? ret : -1);
+	/*
+	 * Test before PF_KTHREAD because all workqueue worker threads are
+	 * kernel threads.
+	 */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_comm(tcomm, sizeof(tcomm), p);
+	else if (p->flags & PF_KTHREAD)
+		get_kthread_comm(tcomm, sizeof(tcomm), p);
+	else
+		get_task_comm(tcomm, p);
 
-	seq_putc(m, '\n');
+	if (escape)
+		seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+	else
+		seq_printf(m, "%.64s", tcomm);
 }
 
 /*
@@ -118,43 +124,25 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
  * simple bit tests.
  */
 static const char * const task_state_array[] = {
-	"R (running)",		/*  0 */
-	"S (sleeping)",		/*  1 */
-	"D (disk sleep)",	/*  2 */
-	"T (stopped)",		/*  4 */
-	"t (tracing stop)",	/*  8 */
-	"X (dead)",		/* 16 */
-	"Z (zombie)",		/* 32 */
+
+	/* states in TASK_REPORT: */
+	"R (running)",		/* 0x00 */
+	"S (sleeping)",		/* 0x01 */
+	"D (disk sleep)",	/* 0x02 */
+	"T (stopped)",		/* 0x04 */
+	"t (tracing stop)",	/* 0x08 */
+	"X (dead)",		/* 0x10 */
+	"Z (zombie)",		/* 0x20 */
+	"P (parked)",		/* 0x40 */
+
+	/* states beyond TASK_REPORT: */
+	"I (idle)",		/* 0x80 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
 {
-	unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
-
-	/*
-	 * Parked tasks do not run; they sit in __kthread_parkme().
-	 * Without this check, we would report them as running, which is
-	 * clearly wrong, so we report them as sleeping instead.
-	 */
-	if (tsk->state == TASK_PARKED)
-		state = TASK_INTERRUPTIBLE;
-
-	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
-
-	return task_state_array[fls(state)];
-}
-
-static inline int get_task_umask(struct task_struct *tsk)
-{
-	struct fs_struct *fs;
-	int umask = -ENOENT;
-
-	task_lock(tsk);
-	fs = tsk->fs;
-	if (fs)
-		umask = fs->umask;
-	task_unlock(tsk);
-	return umask;
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+	return task_state_array[task_state_index(tsk)];
 }
 
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -162,35 +150,34 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 {
 	struct user_namespace *user_ns = seq_user_ns(m);
 	struct group_info *group_info;
-	int g, umask;
+	int g, umask = -1;
 	struct task_struct *tracer;
 	const struct cred *cred;
 	pid_t ppid, tpid = 0, tgid, ngid;
 	unsigned int max_fds = 0;
 
 	rcu_read_lock();
-	ppid = pid_alive(p) ?
-		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
 	tracer = ptrace_parent(p);
 	if (tracer)
 		tpid = task_pid_nr_ns(tracer, ns);
 
+	ppid = task_ppid_nr_ns(p, ns);
 	tgid = task_tgid_nr_ns(p, ns);
 	ngid = task_numa_group_id(p);
 	cred = get_task_cred(p);
 
-	umask = get_task_umask(p);
-	if (umask >= 0)
-		seq_printf(m, "Umask:\t%#04o\n", umask);
-
 	task_lock(p);
+	if (p->fs)
+		umask = p->fs->umask;
 	if (p->files)
 		max_fds = files_fdtable(p->files)->max_fds;
 	task_unlock(p);
 	rcu_read_unlock();
 
-	seq_printf(m, "State:\t%s", get_task_state(p));
+	if (umask >= 0)
+		seq_printf(m, "Umask:\t%#04o\n", umask);
+	seq_puts(m, "State:\t");
+	seq_puts(m, get_task_state(p));
 
 	seq_put_decimal_ull(m, "\nTgid:\t", tgid);
 	seq_put_decimal_ull(m, "\nNgid:\t", ngid);
@@ -231,6 +218,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
 #endif
 	seq_putc(m, '\n');
+
+	seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
 }
 
 void render_sigset_t(struct seq_file *m, const char *header,
@@ -255,8 +244,8 @@ void render_sigset_t(struct seq_file *m, const char *header,
 	seq_putc(m, '\n');
 }
 
-static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
-				    sigset_t *catch)
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
+				    sigset_t *sigcatch)
 {
 	struct k_sigaction *k;
 	int i;
 
@@ -264,9 +253,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
 	k = p->sighand->action;
 	for (i = 1; i <= _NSIG; ++i, ++k) {
 		if (k->sa.sa_handler == SIG_IGN)
-			sigaddset(ign, i);
+			sigaddset(sigign, i);
 		else if (k->sa.sa_handler != SIG_DFL)
-			sigaddset(catch, i);
+			sigaddset(sigcatch, i);
 	}
 }
 
@@ -275,7 +264,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	unsigned long flags;
 	sigset_t pending, shpending, blocked, ignored, caught;
 	int num_threads = 0;
-	unsigned long qsize = 0;
+	unsigned int qsize = 0;
 	unsigned long qlim = 0;
 
 	sigemptyset(&pending);
@@ -291,7 +280,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
-		qsize = atomic_read(&__task_cred(p)->user->sigpending);
+		qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
 		rcu_read_unlock();
 		qlim = task_rlimit(p, RLIMIT_SIGPENDING);
 		unlock_task_sighand(p, &flags);
@@ -312,13 +301,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 static void render_cap_t(struct seq_file *m, const char *header,
 			kernel_cap_t *a)
 {
-	unsigned __capi;
-
 	seq_puts(m, header);
-	CAP_FOR_EACH_U32(__capi) {
-		seq_printf(m, "%08x",
-			   a->cap[CAP_LAST_U32 - __capi]);
-	}
+	seq_put_hex_ll(m, NULL, a->val, 16);
 	seq_putc(m, '\n');
 }
 
@@ -349,7 +333,63 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
+	seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+			    atomic_read(&p->seccomp.filter_count));
+#endif
 #endif
+	seq_puts(m, "\nSpeculation_Store_Bypass:\t");
+	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+	case -EINVAL:
+		seq_puts(m, "unknown");
+		break;
+	case PR_SPEC_NOT_AFFECTED:
+		seq_puts(m, "not vulnerable");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+		seq_puts(m, "thread force mitigated");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+		seq_puts(m, "thread mitigated");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+		seq_puts(m, "thread vulnerable");
+		break;
+	case PR_SPEC_DISABLE:
+		seq_puts(m, "globally mitigated");
+		break;
+	default:
+		seq_puts(m, "vulnerable");
+		break;
+	}
+
+	seq_puts(m, "\nSpeculationIndirectBranch:\t");
+	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) {
+	case -EINVAL:
+		seq_puts(m, "unsupported");
+		break;
+	case PR_SPEC_NOT_AFFECTED:
+		seq_puts(m, "not affected");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+		seq_puts(m, "conditional force disabled");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+		seq_puts(m, "conditional disabled");
+		break;
+	case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+		seq_puts(m, "conditional enabled");
+		break;
+	case PR_SPEC_ENABLE:
+		seq_puts(m, "always enabled");
+		break;
+	case PR_SPEC_DISABLE:
+		seq_puts(m, "always disabled");
+		break;
+	default:
+		seq_puts(m, "unknown");
+		break;
+	}
 	seq_putc(m, '\n');
 }
 
@@ -364,9 +404,34 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(&task->cpus_mask));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(&task->cpus_mask));
+}
+
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
+{
+	seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
+	seq_putc(m, '\n');
+}
+
+static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
+{
+	bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
+
+	if (thp_enabled)
+		thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
+	seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
+}
+
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+	seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+					  struct task_struct *task)
+{
 }
 
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -374,11 +439,17 @@
 {
 	struct mm_struct *mm = get_task_mm(task);
 
-	task_name(m, task);
+	seq_puts(m, "Name:\t");
+	proc_task_name(m, task, true);
+	seq_putc(m, '\n');
+
 	task_state(m, ns, pid, task);
 
 	if (mm) {
 		task_mem(m, mm);
+		task_core_dumping(m, task);
+		task_thp_status(m, mm);
+		task_untag_mask(m, mm);
 		mmput(mm);
 	}
 	task_sig(m, task);
@@ -387,6 +458,7 @@
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
+	arch_proc_pid_thread_features(m, task);
 	return 0;
 }
 
@@ -403,13 +475,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	int permitted;
 	struct mm_struct *mm;
 	unsigned long long start_time;
-	unsigned long cmin_flt = 0, cmaj_flt = 0;
-	unsigned long min_flt = 0, maj_flt = 0;
-	u64 cutime, cstime, utime, stime;
-	u64 cgtime, gtime;
+	unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+	u64 cutime, cstime, cgtime, utime, stime, gtime;
 	unsigned long rsslim = 0;
-	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
+	int exit_code = task->exit_code;
+	struct signal_struct *sig = task->signal;
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -421,19 +492,24 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		 * esp and eip are intentionally zeroed out.  There is no
 		 * non-racy way to read them without freezing the task.
 		 * Programs that need reliable values can use ptrace(2).
+		 *
+		 * The only exception is if the task is core dumping because
+		 * a program is not able to use ptrace(2) in that case. It is
+		 * safe because the task has stopped executing permanently.
+		 */
+		if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
+			if (try_get_task_stack(task)) {
+				eip = KSTK_EIP(task);
+				esp = KSTK_ESP(task);
+				put_task_stack(task);
+			}
+		}
 	}
 
-	get_task_comm(tcomm, task);
-
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = utime = stime = 0;
-	cgtime = gtime = 0;
 
 	if (lock_task_sighand(task, &flags)) {
-		struct signal_struct *sig = task->signal;
-
 		if (sig->tty) {
 			struct pid *pgrp = tty_get_pgrp(sig->tty);
 			tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -444,26 +520,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
-		cmin_flt = sig->cmin_flt;
-		cmaj_flt = sig->cmaj_flt;
-		cutime = sig->cutime;
-		cstime = sig->cstime;
-		cgtime = sig->cgtime;
-		rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+		rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
-		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_struct *t = task;
-			do {
-				min_flt += t->min_flt;
-				maj_flt += t->maj_flt;
-				gtime += task_gtime(t);
-			} while_each_thread(task, t);
-
-			min_flt += sig->min_flt;
-			maj_flt += sig->maj_flt;
-			thread_group_cputime_adjusted(task, &utime, &stime);
-			gtime += sig->gtime;
+			if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+				exit_code = sig->group_exit_code;
 		}
 
 		sid = task_session_nr_ns(task, ns);
@@ -474,11 +535,38 @@
 	}
 
 	if (permitted && (!whole || num_threads < 2))
-		wchan = get_wchan(task);
-	if (!whole) {
+		wchan = !task_is_running(task);
+
+	scoped_guard(rcu) {
+		scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+			cmin_flt = sig->cmin_flt;
+			cmaj_flt = sig->cmaj_flt;
+			cutime = sig->cutime;
+			cstime = sig->cstime;
+			cgtime = sig->cgtime;
+
+			if (whole) {
+				struct task_struct *t;
+
+				min_flt = sig->min_flt;
+				maj_flt = sig->maj_flt;
+				gtime = sig->gtime;
+
+				__for_each_thread(sig, t) {
+					min_flt += t->min_flt;
+					maj_flt += t->maj_flt;
+					gtime += task_gtime(t);
+				}
+			}
+		}
+	}
+
+	if (whole) {
+		thread_group_cputime_adjusted(task, &utime, &stime);
+	} else {
+		task_cputime_adjusted(task, &utime, &stime);
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 
@@ -487,10 +575,15 @@
 	priority = task_prio(task);
 	nice = task_nice(task);
 
-	/* convert nsec -> ticks */
-	start_time = nsec_to_clock_t(task->real_start_time);
+	/* apply timens offset for boottime and convert nsec -> ticks */
+	start_time =
+		nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
 
-	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+	seq_puts(m, " (");
+	proc_task_name(m, task, false);
+	seq_puts(m, ") ");
+	seq_putc(m, state);
 	seq_put_decimal_ll(m, " ", ppid);
 	seq_put_decimal_ll(m, " ", pgid);
 	seq_put_decimal_ll(m, " ", sid);
@@ -534,10 +627,7 @@
 	 *
 	 * This works with older implementations of procps as well.
 	 */
-	if (wchan)
-		seq_puts(m, " 1");
-	else
-		seq_puts(m, " 0");
+	seq_put_decimal_ull(m, " ", wchan);
 
 	seq_put_decimal_ull(m, " ", 0);
 	seq_put_decimal_ull(m, " ", 0);
@@ -561,7 +651,7 @@
 	seq_puts(m, " 0 0 0 0 0 0 0");
 
 	if (permitted)
-		seq_put_decimal_ll(m, " ", task->exit_code);
+		seq_put_decimal_ll(m, " ", exit_code);
 	else
 		seq_puts(m, " 0");
 
@@ -586,28 +676,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
-	unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
 	struct mm_struct *mm = get_task_mm(task);
 
 	if (mm) {
+		unsigned long size;
+		unsigned long resident = 0;
+		unsigned long shared = 0;
+		unsigned long text = 0;
+		unsigned long data = 0;
+
 		size = task_statm(mm, &shared, &text, &data, &resident);
 		mmput(mm);
-	}
-	/*
-	 * For quick read, open code by putting numbers directly
-	 * expected format is
-	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
-	 *               size, resident, shared, text, data);
-	 */
-	seq_put_decimal_ull(m, "", size);
-	seq_put_decimal_ull(m, " ", resident);
-	seq_put_decimal_ull(m, " ", shared);
-	seq_put_decimal_ull(m, " ", text);
-	seq_put_decimal_ull(m, " ", 0);
-	seq_put_decimal_ull(m, " ", data);
-	seq_put_decimal_ull(m, " ", 0);
-	seq_putc(m, '\n');
+
+		/*
+		 * For quick read, open code by putting numbers directly
+		 * expected format is
+		 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+		 *               size, resident, shared, text, data);
+		 */
+		seq_put_decimal_ull(m, "", size);
+		seq_put_decimal_ull(m, " ", resident);
+		seq_put_decimal_ull(m, " ", shared);
+		seq_put_decimal_ull(m, " ", text);
+		seq_put_decimal_ull(m, " ", 0);
+		seq_put_decimal_ull(m, " ", data);
+		seq_put_decimal_ull(m, " ", 0);
+		seq_putc(m, '\n');
+	} else {
+		seq_write(m, "0 0 0 0 0 0 0\n", 14);
+	}
 
 	return 0;
 }
@@ -670,25 +767,22 @@ out:
 
 static int children_seq_show(struct seq_file *seq, void *v)
 {
-	struct inode *inode = seq->private;
-	pid_t pid;
-
-	pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
-	seq_printf(seq, "%d ", pid);
+	struct inode *inode = file_inode(seq->file);
 
+	seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb)));
 	return 0;
 }
 
 static void *children_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return get_children_pid(seq->private, NULL, *pos);
+	return get_children_pid(file_inode(seq->file), NULL, *pos);
 }
 
 static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct pid *pid;
 
-	pid = get_children_pid(seq->private, v, *pos + 1);
+	pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
 	put_pid(v);
 
 	++*pos;
@@ -709,29 +803,13 @@ static const struct seq_operations children_seq_ops = {
 
 static int children_seq_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-
-	ret = seq_open(file, &children_seq_ops);
-	if (ret)
-		return ret;
-
-	m = file->private_data;
-	m->private = inode;
-
-	return ret;
-}
-
-int children_seq_release(struct inode *inode, struct file *file)
-{
-	seq_release(inode, file);
-	return 0;
+	return seq_open(file, &children_seq_ops);
 }
 
 const struct file_operations proc_tid_children_operations = {
	.open    = children_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
-	.release = children_seq_release,
+	.release = seq_release,
 };
 #endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea1..4eec684baca9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  linux/fs/proc/base.c
  *
@@ -57,7 +58,7 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/file.h>
-#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/namei.h>
@@ -72,8 +73,8 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/printk.h>
+#include <linux/cache.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -83,6 +84,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
@@ -90,15 +92,18 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/stat.h>
-#include <linux/flex_array.h>
 #include <linux/posix-timers.h>
-#ifdef CONFIG_HARDWALL
-#include <asm/hardwall.h>
-#endif
+#include <linux/time_namespace.h>
+#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
+#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
 
+#include "../../lib/kstrtox.h"
+
 /* NOTE:
  *	Implementing inode permission operations in /proc is almost
  *	certainly an error.  Permission checks need to happen during
@@ -109,8 +114,42 @@
  *	in /proc for a task before it execs a suid executable.
 */
 
-static u8 nlink_tid;
-static u8 nlink_tgid;
+static u8 nlink_tid __ro_after_init;
+static u8 nlink_tgid __ro_after_init;
+
+enum proc_mem_force {
+	PROC_MEM_FORCE_ALWAYS,
+	PROC_MEM_FORCE_PTRACE,
+	PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+	IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+	IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+						   PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+	{ "always", PROC_MEM_FORCE_ALWAYS },
+	{ "ptrace", PROC_MEM_FORCE_PTRACE },
+	{ "never", PROC_MEM_FORCE_NEVER },
+	{ }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	/*
+	 * lookup_constant() defaults to proc_mem_force_override to preserve
+	 * the initial Kconfig choice in case an invalid param gets passed.
+	 */
+	proc_mem_force_override = lookup_constant(proc_mem_force_table,
+						  buf, proc_mem_force_override);
+
+	return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
 
 struct pid_entry {
 	const char *name;
@@ -139,9 +178,13 @@ struct pid_entry {
 #define REG(NAME, MODE, fops)				\
 	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
 #define ONE(NAME, MODE, show)				\
-	NOD(NAME, (S_IFREG|(MODE)), 			\
+	NOD(NAME, (S_IFREG|(MODE)),			\
		NULL, &proc_single_file_operations,	\
		{ .proc_show = show } )
+#define ATTR(LSMID, NAME, MODE)				\
+	NOD(NAME, (S_IFREG|(MODE)),			\
+		NULL, &proc_pid_attr_operations,	\
+		{ .lsmid = LSMID })
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -204,171 +247,165 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
-				     size_t _count, loff_t *pos)
+/*
+ * If the user used setproctitle(), we just get the string from
+ * user space at arg_start, and limit it to a maximum of one page.
+ */
+static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
+				size_t count, unsigned long pos,
+				unsigned long arg_start)
 {
-	struct task_struct *tsk;
-	struct mm_struct *mm;
 	char *page;
-	unsigned long count = _count;
-	unsigned long arg_start, arg_end, env_start, env_end;
-	unsigned long len1, len2, len;
-	unsigned long p;
-	char c;
-	ssize_t rv;
-
-	BUG_ON(*pos < 0);
+	int ret, got;
 
-	tsk = get_proc_task(file_inode(file));
-	if (!tsk)
-		return -ESRCH;
-	mm = get_task_mm(tsk);
-	put_task_struct(tsk);
-	if (!mm)
+	if (pos >= PAGE_SIZE)
 		return 0;
-	/* Check if process spawned far enough to have cmdline. */
-	if (!mm->env_end) {
-		rv = 0;
-		goto out_mmput;
-	}
-
-	page = (char *)__get_free_page(GFP_TEMPORARY);
-	if (!page) {
-		rv = -ENOMEM;
-		goto out_mmput;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	ret = 0;
+	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
+	if (got > 0) {
+		int len = strnlen(page, got);
+
+		/* Include the NUL character if it was found */
+		if (len < got)
+			len++;
+
+		if (len > pos) {
+			len -= pos;
+			if (len > count)
+				len = count;
+			len -= copy_to_user(buf, page+pos, len);
+			if (!len)
+				len = -EFAULT;
+			ret = len;
+		}
 	}
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	unsigned long arg_start, arg_end, env_start, env_end;
+	unsigned long pos, len;
+	char *page, c;
 
-	down_read(&mm->mmap_sem);
+	/* Check if process spawned far enough to have cmdline. */
+	if (!mm->env_end)
+		return 0;
+
+	spin_lock(&mm->arg_lock);
 	arg_start = mm->arg_start;
 	arg_end = mm->arg_end;
 	env_start = mm->env_start;
 	env_end = mm->env_end;
-	up_read(&mm->mmap_sem);
+	spin_unlock(&mm->arg_lock);
+
+	if (arg_start >= arg_end)
+		return 0;
 
-	BUG_ON(arg_start > arg_end);
-	BUG_ON(env_start > env_end);
+	/*
+	 * We allow setproctitle() to overwrite the argument
+	 * strings, and overflow past the original end. But
+	 * only when it overflows into the environment area.
+	 */
+	if (env_start != arg_end || env_end < env_start)
+		env_start = env_end = arg_end;
+	len = env_end - arg_start;
 
-	len1 = arg_end - arg_start;
-	len2 = env_end - env_start;
+	/* We're not going to care if "*ppos" has high bits set */
+	pos = *ppos;
+	if (pos >= len)
+		return 0;
+	if (count > len - pos)
+		count = len - pos;
+	if (!count)
+		return 0;
 
-	/* Empty ARGV. */
-	if (len1 == 0) {
-		rv = 0;
-		goto out_free_page;
-	}
	/*
-	 * Inherently racy -- command line shares address space
-	 * with code and data.
+	 * Magical special case: if the argv[] end byte is not
+	 * zero, the user has overwritten it with setproctitle(3).
+	 *
+	 * Possible future enhancement: do this only once when
+	 * pos is 0, and set a flag in the 'struct file'.
	 */
-	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
-	if (rv <= 0)
-		goto out_free_page;
-
-	rv = 0;
-
-	if (c == '\0') {
-		/* Command line (set of strings) occupies whole ARGV. */
-		if (len1 <= *pos)
-			goto out_free_page;
-
-		p = arg_start + *pos;
-		len = len1 - *pos;
-		while (count > 0 && len > 0) {
-			unsigned int _count;
-			int nr_read;
-
-			_count = min3(count, len, PAGE_SIZE);
-			nr_read = access_remote_vm(mm, p, page, _count, 0);
-			if (nr_read < 0)
-				rv = nr_read;
-			if (nr_read <= 0)
-				goto out_free_page;
-
-			if (copy_to_user(buf, page, nr_read)) {
-				rv = -EFAULT;
-				goto out_free_page;
-			}
+	if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
+		return get_mm_proctitle(mm, buf, count, pos, arg_start);
 
-			p += nr_read;
-			len -= nr_read;
-			buf += nr_read;
-			count -= nr_read;
-			rv += nr_read;
-		}
-	} else {
-		/*
-		 * Command line (1 string) occupies ARGV and
-		 * extends into ENVP.
-		 */
-		struct {
-			unsigned long p;
-			unsigned long len;
-		} cmdline[2] = {
-			{ .p = arg_start, .len = len1 },
-			{ .p = env_start, .len = len2 },
-		};
-		loff_t pos1 = *pos;
-		unsigned int i;
-
-		i = 0;
-		while (i < 2 && pos1 >= cmdline[i].len) {
-			pos1 -= cmdline[i].len;
-			i++;
-		}
-		while (i < 2) {
-			p = cmdline[i].p + pos1;
-			len = cmdline[i].len - pos1;
-			while (count > 0 && len > 0) {
-				unsigned int _count, l;
-				int nr_read;
-				bool final;
-
-				_count = min3(count, len, PAGE_SIZE);
-				nr_read = access_remote_vm(mm, p, page, _count, 0);
-				if (nr_read < 0)
-					rv = nr_read;
-				if (nr_read <= 0)
-					goto out_free_page;
-
-				/*
-				 * Command line can be shorter than whole ARGV
-				 * even if last "marker" byte says it is not.
-				 */
-				final = false;
-				l = strnlen(page, nr_read);
-				if (l < nr_read) {
-					nr_read = l;
-					final = true;
-				}
-
-				if (copy_to_user(buf, page, nr_read)) {
-					rv = -EFAULT;
-					goto out_free_page;
-				}
-
-				p += nr_read;
-				len -= nr_read;
-				buf += nr_read;
-				count -= nr_read;
-				rv += nr_read;
-
-				if (final)
-					goto out_free_page;
-			}
+	/*
+	 * For the non-setproctitle() case we limit things strictly
+	 * to the [arg_start, arg_end[ range.
+	 */
+	pos += arg_start;
+	if (pos < arg_start || pos >= arg_end)
+		return 0;
+	if (count > arg_end - pos)
+		count = arg_end - pos;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	len = 0;
+	while (count) {
+		int got;
+		size_t size = min_t(size_t, PAGE_SIZE, count);
 
-			/* Only first chunk can be read partially. */
-			pos1 = 0;
-			i++;
+		got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+		if (got <= 0)
+			break;
+		got -= copy_to_user(buf, page, got);
+		if (unlikely(!got)) {
+			if (!len)
+				len = -EFAULT;
+			break;
 		}
+		pos += got;
+		buf += got;
+		len += got;
+		count -= got;
 	}
 
-out_free_page:
 	free_page((unsigned long)page);
-out_mmput:
+	return len;
+}
+
+static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+				size_t count, loff_t *pos)
+{
+	struct mm_struct *mm;
+	ssize_t ret;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	ret = get_mm_cmdline(mm, buf, count, pos);
 	mmput(mm);
-	if (rv > 0)
-		*pos += rv;
-	return rv;
+	return ret;
+}
+
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+				     size_t count, loff_t *pos)
+{
+	struct task_struct *tsk;
+	ssize_t ret;
+
+	BUG_ON(*pos < 0);
+
+	tsk = get_proc_task(file_inode(file));
+	if (!tsk)
+		return -ESRCH;
+	ret = get_task_cmdline(tsk, buf, count, pos);
+	put_task_struct(tsk);
+	if (ret > 0)
+		*pos += ret;
+	return ret;
 }
 
 static const struct file_operations proc_pid_cmdline_ops = {
@@ -379,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
 #ifdef CONFIG_KALLSYMS
 /*
  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol.  If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
 */
 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
@@ -387,25 +424,28 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long wchan;
 	char symname[KSYM_NAME_LEN];
 
-	wchan = get_wchan(task);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+		goto print0;
 
-	if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
-			&& !lookup_symbol_name(wchan, symname))
-		seq_printf(m, "%s", symname);
-	else
-		seq_putc(m, '0');
+	wchan = get_wchan(task);
+	if (wchan && !lookup_symbol_name(wchan, symname)) {
+		seq_puts(m, symname);
+		return 0;
+	}
 
+print0:
+	seq_putc(m, '0');
 	return 0;
 }
 #endif /* CONFIG_KALLSYMS */
 
 static int lock_trace(struct task_struct *task)
 {
-	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	int err = down_read_killable(&task->signal->exec_update_lock);
 	if (err)
 		return err;
 	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
-		mutex_unlock(&task->signal->cred_guard_mutex);
+		up_read(&task->signal->exec_update_lock);
 		return -EPERM;
 	}
 	return 0;
@@ -413,7 +453,7 @@ static int lock_trace(struct task_struct *task)
 
 static void unlock_trace(struct task_struct *task)
 {
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	up_read(&task->signal->exec_update_lock);
 }
 
 #ifdef CONFIG_STACKTRACE
@@ -423,28 +463,39 @@ static void unlock_trace(struct task_struct *task)
 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
-	struct stack_trace trace;
 	unsigned long *entries;
 	int err;
-	int i;
 
-	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+	/*
+	 * The ability to racily run the kernel stack unwinder on a running task
+	 * and then observe the unwinder output is scary; while it is useful for
+	 * debugging kernel issues, it can also allow an attacker to leak kernel
+	 * stack contents.
+	 * Doing this in a manner that is at least safe from races would require
+	 * some work to ensure that the remote task can not be scheduled; and
+	 * even then, this would still expose the unwinder as local attack
+	 * surface.
+	 * Therefore, this interface is restricted to root.
+	 */
+	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EACCES;
+
+	entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
+				GFP_KERNEL);
 	if (!entries)
 		return -ENOMEM;
 
-	trace.nr_entries	= 0;
-	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
-	trace.entries		= entries;
-	trace.skip		= 0;
 
 	err = lock_trace(task);
 	if (!err) {
-		save_stack_trace_tsk(task, &trace);
+		unsigned int i, nr_entries;
+
+		nr_entries = stack_trace_save_tsk(task, entries,
+						  MAX_STACK_TRACE_DEPTH, 0);
 
-		for (i = 0; i < trace.nr_entries; i++) {
-			seq_printf(m, "[<%pK>] %pB\n",
-				   (void *)entries[i], (void *)entries[i]);
+		for (i = 0; i < nr_entries; i++) {
+			seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
 		}
+
 		unlock_trace(task);
 	}
 	kfree(entries);
@@ -461,7 +512,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
			      struct pid *pid, struct task_struct *task)
 {
 	if (unlikely(!sched_info_on()))
-		seq_printf(m, "0 0 0\n");
+		seq_puts(m, "0 0 0\n");
	else
		seq_printf(m, "%llu %llu %lu\n",
		   (unsigned long long)task->se.sum_exec_runtime,
@@ -482,7 +533,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 	if (!task)
 		return -ESRCH;
 	seq_puts(m, "Latency Top version : v0.1\n");
-	for (i = 0; i < 32; i++) {
+	for (i = 0; i < LT_SAVECOUNT; i++) {
 		struct latency_record *lr = &task->latency_record[i];
 		if (lr->backtrace[0]) {
 			int q;
@@ -490,10 +541,9 @@
				   lr->count, lr->time, lr->max);
			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
				unsigned long bt = lr->backtrace[q];
+
				if (!bt)
					break;
-				if (bt == ULONG_MAX)
-					break;
				seq_printf(m, " %ps", (void *)bt);
			}
			seq_putc(m, '\n');
@@ -516,7 +566,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf,
 	if (!task)
 		return -ESRCH;
-	clear_all_latency_tracing(task);
+	clear_tsk_latency_tracing(task);
 	put_task_struct(task);
 
 	return count;
@@ -535,11 +585,19 @@ static const struct file_operations proc_lstats_operations = {
 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
 {
-	unsigned long totalpages = totalram_pages + total_swap_pages;
+	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
+	long badness;
+
+	badness = oom_badness(task, totalpages);
+	/*
+	 * Special case OOM_SCORE_ADJ_MIN for all others scale the
+	 * badness value into [0, 2000] range which we have been
+	 * exporting for a long time so userspace might depend on it.
+	 */
+	if (badness != LONG_MIN)
+		points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
 
-	points = oom_badness(task, NULL, NULL, totalpages) *
-					1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
@@ -586,8 +644,10 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
	/*
	 * print the file header
	 */
-	seq_printf(m, "%-25s %-20s %-20s %-10s\n",
-		   "Limit", "Soft Limit", "Hard Limit", "Units");
+	seq_puts(m, "Limit                     "
+		"Soft Limit           "
+		"Hard Limit           "
+		"Units     \n");
 
	for (i = 0; i < RLIM_NLIMITS; i++) {
		if (rlim[i].rlim_cur == RLIM_INFINITY)
@@ -615,24 +675,25 @@
 static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
 {
-	long nr;
-	unsigned long args[6], sp, pc;
+	struct syscall_info info;
+	u64 *args = &info.data.args[0];
 	int res;
 
 	res = lock_trace(task);
 	if (res)
 		return res;
 
-	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+	if (task_current_syscall(task, &info))
 		seq_puts(m, "running\n");
-	else if (nr < 0)
-		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+	else if (info.data.nr < 0)
+		seq_printf(m, "%d 0x%llx 0x%llx\n",
+			   info.data.nr, info.sp, info.data.instruction_pointer);
	else
		seq_printf(m,
-		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
-		       nr,
+		       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+		       info.data.nr,
		       args[0], args[1], args[2], args[3], args[4], args[5],
-		       sp, pc);
+		       info.sp, info.data.instruction_pointer);
 	unlock_trace(task);
 
 	return 0;
@@ -644,10 +705,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
 /************************************************************************/
 
 /* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
+static bool proc_fd_access_allowed(struct inode *inode)
 {
	struct task_struct *task;
-	int allowed = 0;
+	bool allowed = false;
	/* Allow access to a task's file descriptors if it is us or we
	 * may use ptrace attach to the process and find out that
	 * information.
@@ -660,7 +721,8 @@ static int proc_fd_access_allowed(struct inode *inode)
 	return allowed;
 }
 
-int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+		 struct iattr *attr)
 {
	int error;
	struct inode *inode = d_inode(dentry);
@@ -668,12 +730,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
	if (attr->ia_valid & ATTR_MODE)
		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (error)
		return error;
 
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
+	setattr_copy(&nop_mnt_idmap, inode, attr);
	return 0;
 }
 
@@ -681,32 +742,41 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 */
-static bool has_pid_permissions(struct pid_namespace *pid,
+static bool has_pid_permissions(struct proc_fs_info *fs_info,
				 struct task_struct *task,
-				 int hide_pid_min)
+				 enum proc_hidepid hide_pid_min)
 {
-	if (pid->hide_pid < hide_pid_min)
+	/*
+	 * If 'hidpid' mount option is set force a ptrace check,
+	 * we indicate that we are using a filesystem syscall
+	 * by passing PTRACE_MODE_READ_FSCREDS
+	 */
+	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
+	if (fs_info->hide_pid < hide_pid_min)
		return true;
-	if (in_group_p(pid->pid_gid))
+	if (in_group_p(fs_info->pid_gid))
		return true;
	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 }
 
-static int proc_pid_permission(struct inode *inode, int mask)
+static int proc_pid_permission(struct mnt_idmap *idmap,
+			       struct inode *inode, int mask)
 {
-	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
	struct task_struct *task;
	bool has_perms;
 
	task = get_proc_task(inode);
	if (!task)
		return -ESRCH;
-	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+	has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
	put_task_struct(task);
 
	if (!has_perms) {
-		if (pid->hide_pid == HIDEPID_INVISIBLE) {
+		if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
			/*
			 * Let's make getdents(), stat(), and open()
			 * consistent with each other.  If a process
@@ -718,7 +788,7 @@ static int proc_pid_permission(struct inode *inode, int mask)
		return -EPERM;
	}
 
-	return generic_permission(inode, mask);
+	return generic_permission(&nop_mnt_idmap, inode, mask);
 }
 
 
@@ -730,13 +800,11 @@ static const struct inode_operations proc_def_inode_operations = {
 static int proc_single_show(struct seq_file *m, void *v)
 {
	struct inode *inode = m->private;
-	struct pid_namespace *ns;
-	struct pid *pid;
+	struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+	struct pid *pid = proc_pid(inode);
	struct task_struct *task;
	int ret;
 
-	ns = inode->i_sb->s_fs_info;
-	pid = proc_pid(inode);
	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;
@@ -759,23 +827,31 @@ static const struct file_operations proc_single_file_operations = {
	.release	= single_release,
 };
 
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ *   - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ *   - Returns mm_struct* on success
+ *   - Returns error code on failure
+ */
 struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
 {
	struct task_struct *task = get_proc_task(inode);
-	struct mm_struct *mm = ERR_PTR(-ESRCH);
+	struct mm_struct *mm;
 
-	if (task) {
-		mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
-		put_task_struct(task);
+	if (!task)
+		return ERR_PTR(-ESRCH);
 
-		if (!IS_ERR_OR_NULL(mm)) {
-			/* ensure this mm_struct can't be freed */
-			mmgrab(mm);
-			/* but do not pin its memory */
-			mmput(mm);
-		}
-	}
+	mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+	put_task_struct(task);
+
+	if (IS_ERR(mm))
+		return mm == ERR_PTR(-ESRCH) ? NULL : mm;
+
+	/* ensure this mm_struct can't be freed */
+	mmgrab(mm);
+	/* but do not pin its memory */
+	mmput(mm);
 
	return mm;
 }
@@ -784,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 {
	struct mm_struct *mm = proc_mem_open(inode, mode);
 
-	if (IS_ERR(mm))
-		return PTR_ERR(mm);
+	if (IS_ERR_OR_NULL(mm))
+		return mm ? PTR_ERR(mm) : -ESRCH;
 
	file->private_data = mm;
	return 0;
@@ -793,12 +869,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 
 static int mem_open(struct inode *inode, struct file *file)
 {
-	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+		return -EINVAL;
+	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
 
-	return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+	struct task_struct *task;
+	bool ptrace_active = false;
+
+	switch (proc_mem_force_override) {
+	case PROC_MEM_FORCE_NEVER:
+		return false;
+	case PROC_MEM_FORCE_PTRACE:
+		task = get_proc_task(file_inode(file));
+		if (task) {
+			ptrace_active = READ_ONCE(task->ptrace) &&
+					READ_ONCE(task->mm) == mm &&
+					READ_ONCE(task->parent) == current;
+			put_task_struct(task);
+		}
+		return ptrace_active;
+	default:
+		return true;
+	}
 }
 
 static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -813,7 +908,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
	if (!mm)
		return 0;
 
-	page = (char *)__get_free_page(GFP_TEMPORARY);
+	page = (char *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
 
@@ -821,10 +916,12 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
	if (!mmget_not_zero(mm))
		goto free;
 
-	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+	flags = write ? FOLL_WRITE : 0;
+	if (proc_mem_foll_force(file, mm))
+		flags |= FOLL_FORCE;
 
	while (count > 0) {
-		int this_len = min_t(int, count, PAGE_SIZE);
+		size_t this_len = min_t(size_t, count, PAGE_SIZE);
 
		if (write && copy_from_user(page, buf, this_len)) {
			copied = -EFAULT;
@@ -898,6 +995,7 @@ static const struct file_operations proc_mem_operations = {
	.write		= mem_write,
	.open		= mem_open,
	.release	= mem_release,
+	.fop_flags	= FOP_UNSIGNED_OFFSET,
 };
 
 static int environ_open(struct inode *inode, struct file *file)
@@ -918,7 +1016,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
	if (!mm || !mm->env_end)
		return 0;
 
-	page = (char *)__get_free_page(GFP_TEMPORARY);
+	page = (char *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
 
@@ -926,10 +1024,10 @@ static ssize_t environ_read(struct file *file, char __user *buf,
	if (!mmget_not_zero(mm))
		goto free;
 
-	down_read(&mm->mmap_sem);
+	spin_lock(&mm->arg_lock);
	env_start = mm->env_start;
	env_end = mm->env_end;
-	up_read(&mm->mmap_sem);
+	spin_unlock(&mm->arg_lock);
 
	while (count > 0) {
		size_t this_len, max_len;
@@ -943,7 +1041,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
		max_len = min_t(size_t, PAGE_SIZE, count);
		this_len = min(max_len, this_len);
 
-		retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
+		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
 
		if (retval <= 0) {
			ret = retval;
@@ -1018,13 +1116,14 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
		oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
			  OOM_SCORE_ADJ_MAX;
	put_task_struct(task);
+	if (oom_adj > OOM_ADJUST_MAX)
+		oom_adj = OOM_ADJUST_MAX;
	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
 static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
 {
-	static DEFINE_MUTEX(oom_adj_mutex);
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	int err = 0;
@@ -1064,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
		struct task_struct *p = find_lock_task_mm(task);
 
		if (p) {
-			if (atomic_read(&p->mm->mm_users) > 1) {
+			if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
				mm = p->mm;
				mmgrab(mm);
			}
@@ -1091,10 +1190,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
			task_lock(p);
			if (!p->vfork_done && process_shares_mm(p, mm)) {
-				pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
-						task_pid_nr(p), p->comm,
-						p->signal->oom_score_adj, oom_adj,
-						task_pid_nr(task), task->comm);
				p->signal->oom_score_adj = oom_adj;
				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
					p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1123,11 +1218,10 @@ err_unlock:
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
 {
-	char buffer[PROC_NUMBUF];
+	char buffer[PROC_NUMBUF] = {};
	int oom_adj;
	int err;
 
-	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
@@ -1183,11 +1277,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
					size_t count, loff_t *ppos)
 {
-	char buffer[PROC_NUMBUF];
+	char buffer[PROC_NUMBUF] = {};
	int oom_score_adj;
	int err;
 
-	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
@@ -1215,7 +1308,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
	.llseek		= default_llseek,
 };
 
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
 #define TMPBUFLEN 11
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
@@ -1242,6 +1335,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
	kuid_t kloginuid;
	int rv;
 
+	/* Don't let kthreads write their own loginuid */
+	if (current->flags & PF_KTHREAD)
+		return -EPERM;
+
	rcu_read_lock();
	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
		rcu_read_unlock();
@@ -1324,13 +1421,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
			const char __user * buf, size_t count, loff_t *ppos)
 {
	struct task_struct *task;
-	char buffer[PROC_NUMBUF];
+	char buffer[PROC_NUMBUF] = {};
	int make_it_fail;
	int rv;
 
	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;
-	memset(buffer, 0, sizeof(buffer));
+
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
@@ -1370,7 +1467,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
-	WRITE_ONCE(task->fail_nth, n);
+	task->fail_nth = n;
	put_task_struct(task);
 
	return count;
@@ -1386,12 +1483,9 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
-	len = snprintf(numbuf, sizeof(numbuf), "%u\n",
-			READ_ONCE(task->fail_nth));
-	len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
+	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
	put_task_struct(task);
-
-	return len;
+	return simple_read_from_buffer(buf, count, ppos, numbuf, len);
 }
 
 static const struct file_operations proc_fail_nth_operations = {
@@ -1401,19 +1495,19 @@ static const struct file_operations proc_fail_nth_operations = {
 #endif
 
 
-#ifdef CONFIG_SCHED_DEBUG
 /*
 * Print out various scheduling related per-task fields:
 */
 static int sched_show(struct seq_file *m, void *v)
 {
	struct inode *inode = m->private;
+	struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
	struct task_struct *p;
 
	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;
-	proc_sched_show_task(p, m);
+	proc_sched_show_task(p, ns, m);
 
	put_task_struct(p);
 
@@ -1450,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = {
	.release	= single_release,
 };
 
-#endif
-
 #ifdef CONFIG_SCHED_AUTOGROUP
 /*
 * Print out autogroup related information:
@@ -1477,11 +1569,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
 {
	struct inode *inode = file_inode(file);
	struct task_struct *p;
-	char buffer[PROC_NUMBUF];
+	char buffer[PROC_NUMBUF] = {};
	int nice;
	int err;
 
-	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
@@ -1527,15 +1618,116 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
 
+#ifdef CONFIG_TIME_NS
+static int timens_offsets_show(struct seq_file *m, void *v)
+{
+	struct task_struct *p;
+
+	p = get_proc_task(file_inode(m->file));
+	if (!p)
+		return -ESRCH;
+	proc_timens_show_offsets(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	struct proc_timens_offset offsets[2];
+	char *kbuf = NULL, *pos, *next_line;
+	struct task_struct *p;
+	int ret, noffsets;
+
+	/* Only allow < page size writes at the beginning of the file */
+	if ((*ppos != 0) || (count >= PAGE_SIZE))
+		return -EINVAL;
+
+	/* Slurp in the user data */
+	kbuf = memdup_user_nul(buf, count);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	/* Parse the user data */
+	ret = -EINVAL;
+	noffsets = 0;
+	for (pos = kbuf; pos; pos = next_line) {
+		struct proc_timens_offset *off = &offsets[noffsets];
+		char clock[10];
+		int err;
+
+		/* Find the end of line and ensure we don't look past it */
+		next_line = strchr(pos, '\n');
+		if (next_line) {
+			*next_line = '\0';
+			next_line++;
+			if (*next_line == '\0')
+				next_line = NULL;
+		}
+
+		err = sscanf(pos, "%9s %lld %lu", clock,
+				&off->val.tv_sec, &off->val.tv_nsec);
+		if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
+			goto out;
+
+		clock[sizeof(clock) - 1] = 0;
+		if (strcmp(clock, "monotonic") == 0 ||
+		    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+			off->clockid = CLOCK_MONOTONIC;
+		else if (strcmp(clock, "boottime") == 0 ||
+			 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+			off->clockid = CLOCK_BOOTTIME;
+		else
+			goto out;
+
+		noffsets++;
+		if (noffsets == ARRAY_SIZE(offsets)) {
+			if (next_line)
+				count = next_line - kbuf;
+			break;
+		}
+	}
+
+	ret = -ESRCH;
+	p = get_proc_task(inode);
+	if (!p)
+		goto out;
+	ret = proc_timens_set_offset(file, p, offsets, noffsets);
+	put_task_struct(p);
+	if (ret)
+		goto out;
+
+	ret = count;
+out:
+	kfree(kbuf);
+	return ret;
+}
+
+static int timens_offsets_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, timens_offsets_show, inode);
+}
+
+static const struct file_operations proc_timens_offsets_operations = {
+	.open		= timens_offsets_open,
+	.read		= seq_read,
+	.write		= timens_offsets_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_TIME_NS */
+
 static ssize_t comm_write(struct file *file, const char __user *buf,
				size_t count, loff_t *offset)
 {
	struct inode *inode = file_inode(file);
	struct task_struct *p;
-	char buffer[TASK_COMM_LEN];
+	char buffer[TASK_COMM_LEN] = {};
	const size_t maxlen = sizeof(buffer) - 1;
 
-	memset(buffer, 0, sizeof(buffer));
	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
		return -EFAULT;
 
@@ -1543,8 +1735,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
	if (!p)
		return -ESRCH;
 
-	if (same_thread_group(current, p))
+	if (same_thread_group(current, p)) {
		set_task_comm(p, buffer);
+		proc_comm_connector(p);
+	}
	else
		count = -EINVAL;
 
@@ -1562,9 +1756,8 @@ static int comm_show(struct seq_file *m, void *v)
	if (!p)
		return -ESRCH;
 
-	task_lock(p);
-	seq_printf(m, "%s\n", p->comm);
-	task_unlock(p);
+	proc_task_name(m, p, false);
+	seq_putc(m, '\n');
 
	put_task_struct(p);
 
@@ -1621,33 +1814,32 @@ static const char *proc_pid_get_link(struct dentry *dentry,
		if (error)
			goto out;
 
-		nd_jump_link(&path);
-		return NULL;
+		error = nd_jump_link(&path);
 out:
	return ERR_PTR(error);
 }
 
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
 {
-	char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
+	char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
	char *pathname;
	int len;
 
	if (!tmp)
		return -ENOMEM;
 
-	pathname = d_path(path, tmp, PAGE_SIZE);
+	pathname = d_path(path, tmp, PATH_MAX);
	len = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;
-	len = tmp + PAGE_SIZE - 1 - pathname;
+	len = tmp + PATH_MAX - 1 - pathname;
 
	if (len > buflen)
		len = buflen;
	if (copy_to_user(buffer, pathname, len))
		len = -EFAULT;
 out:
-	free_page((unsigned long)tmp);
+	kfree(tmp);
	return len;
 }
 
@@ -1680,7 +1872,7 @@ const struct inode_operations proc_pid_link_inode_operations = {
 
 /* building an inode */
 
-void task_dump_owner(struct task_struct *task, mode_t mode,
+void task_dump_owner(struct task_struct *task, umode_t mode,
		     kuid_t *ruid, kgid_t *rgid)
 {
	/* Depending on the state of dumpable compute who should own a
@@ -1690,6 +1882,12 @@ void task_dump_owner(struct task_struct *task, mode_t mode,
	kuid_t uid;
	kgid_t gid;
 
+	if (unlikely(task->flags & PF_KTHREAD)) {
+		*ruid = GLOBAL_ROOT_UID;
+		*rgid = GLOBAL_ROOT_GID;
+		return;
+	}
+
	/* Default to the tasks effective ownership */
	rcu_read_lock();
	cred = __task_cred(task);
@@ -1732,11 +1930,23 @@ void task_dump_owner(struct task_struct *task, mode_t mode,
	*rgid = gid;
 }
 
-struct inode *proc_pid_make_inode(struct super_block * sb,
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+	struct pid *pid = ei->pid;
+
+	if (S_ISDIR(ei->vfs_inode.i_mode)) {
+		spin_lock(&pid->lock);
+		hlist_del_init_rcu(&ei->sibling_inodes);
+		spin_unlock(&pid->lock);
+	}
+}
+
+struct inode *proc_pid_make_inode(struct super_block *sb,
				  struct task_struct *task, umode_t mode)
 {
	struct inode * inode;
	struct proc_inode *ei;
+	struct pid *pid;
 
	/* We need a new inode */
 
@@ -1748,16 +1958,19 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
	ei = PROC_I(inode);
	inode->i_mode = mode;
	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+	simple_inode_init_ts(inode);
	inode->i_op = &proc_def_inode_operations;
 
	/*
	 * grab the reference to task.
*/ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) goto out_unlock; + /* Let the pid remember us for quick removal */ + ei->pid = pid; + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -1769,21 +1982,54 @@ out_unlock: return NULL; } -int pid_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +/* + * Generating an inode and adding it into @pid->inodes, so that task will + * invalidate inode's dentry before being released. + * + * This helper is used for creating dir-type entries under '/proc' and + * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' + * can be released by invalidating '/proc/<tgid>' dentry. + * In theory, dentries under '/proc/<tgid>/task' can also be released by + * invalidating '/proc/<tgid>' dentry, we reserve it to handle single + * thread exiting situation: Any one of threads should invalidate its + * '/proc/<tgid>/task/<pid>' dentry before released. + */ +static struct inode *proc_pid_make_base_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode *inode; + struct proc_inode *ei; + struct pid *pid; + + inode = proc_pid_make_inode(sb, task, mode); + if (!inode) + return NULL; + + /* Let proc_flush_pid find this directory inode */ + ei = PROC_I(inode); + pid = ei->pid; + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + + return inode; +} + +int pid_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - struct pid_namespace *pid = path->dentry->d_sb->s_fs_info; - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - rcu_read_lock(); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { + if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, @@ -1800,34 +2046,41 @@ int pid_getattr(const struct path *path, struct kstat *stat, /* dentry stuff */ /* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - * + * Set <pid>/... inode ownership (can change due to setuid(), etc.) + */ +void pid_update_inode(struct task_struct *task, struct inode *inode) +{ + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); +} + +/* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. 
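* (e.g. after a task calls setuid(), a fresh stat() of its /proc/<pid> * entries should already report the new owner; revalidation refreshes * i_uid/i_gid through task_dump_owner())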
* */ -int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; + int ret = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = d_inode(dentry); - task = get_proc_task(inode); + rcu_read_lock(); + inode = d_inode_rcu(dentry); + if (!inode) + goto out; + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); - - inode->i_mode &= ~(S_ISUID | S_ISGID); - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; + pid_update_inode(task, inode); + ret = 1; } - return 0; +out: + rcu_read_unlock(); + return ret; } static inline bool proc_inode_is_dead(struct inode *inode) @@ -1859,33 +2112,36 @@ const struct dentry_operations pid_dentry_operations = * file type from dcache entry. * * Since all of the proc inode numbers are dynamically generated, the inode - * numbers do not exist until the inode is cache. This means creating the + * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, - const char *name, int len, + const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - unsigned type; - ino_t ino; + unsigned type = DT_UNKNOWN; + ino_t ino = 1; - child = d_hash_and_lookup(dir, &qname); + child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { - int err = instantiate(d_inode(dir), child, task, ptr); + struct dentry *res; + res = instantiate(child, task, ptr); d_lookup_done(child); - if (err < 0) { + if (unlikely(res)) { dput(child); - goto end_instantiate; + child = res; + if (IS_ERR(child)) + goto end_instantiate; } } } @@ -1893,10 +2149,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); - return dir_emit(ctx, name, len, ino, type); - end_instantiate: - return dir_emit(ctx, name, len, 1, DT_UNKNOWN); + return dir_emit(ctx, name, len, ino, type); } /* @@ -1906,13 +2160,43 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + if (str[0] == '0' && str[1] != '-') return -EINVAL; + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) + return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + if (str[0] == '0' && str[1]) + return -EINVAL; + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } -static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +static int map_files_d_revalidate(struct inode *dir, const struct qstr 
*name, + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; @@ -1930,13 +2214,16 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) + if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { - down_read(&mm->mmap_sem); - exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); - up_read(&mm->mmap_sem); + status = mmap_read_lock_killable(mm); + if (!status) { + exact_vma_exists = !!find_exact_vma(mm, vm_start, + vm_end); + mmap_read_unlock(mm); + } } mmput(mm); @@ -1982,15 +2269,18 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) if (rc) goto out_mmput; + rc = mmap_read_lock_killable(mm); + if (rc) + goto out_mmput; + rc = -ENOENT; - down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_mmput: mmput(mm); @@ -1999,22 +2289,22 @@ out: } struct map_files_info { + unsigned long start; + unsigned long end; fmode_t mode; - unsigned int len; - unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; /* - * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the - * symlinks may be used to bypass permissions on ancestor directories in the - * path to the file in question. + * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due + * to concerns about how the symlinks may be used to bypass permissions on + * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - if (!capable(CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); @@ -2029,19 +2319,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = { .setattr = proc_setattr, }; -static int -proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, +static struct dentry * +proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? 
S_IWUSR : 0)); if (!inode) - return -ENOENT; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; @@ -2049,10 +2339,8 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - d_set_d_op(dentry, &tid_map_files_dentry_operations); - d_add(dentry, inode); - - return 0; + return proc_splice_unmountable(inode, dentry, + &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -2061,19 +2349,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; - int result; + struct dentry *result; struct mm_struct *mm; - result = -ENOENT; + result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; - result = -EACCES; + result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; - result = -ENOENT; + result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; @@ -2081,22 +2369,27 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + result = ERR_PTR(-EINTR); + if (mmap_read_lock_killable(mm)) + goto out_put_mm; + + result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; if (vma->vm_file) - result = proc_map_files_instantiate(dir, dentry, task, + result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); +out_put_mm: mmput(mm); out_put_task: put_task_struct(task); out: - return ERR_PTR(result); + return result; } static const struct inode_operations proc_map_files_inode_operations = { @@ -2112,10 +2405,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; + GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; + + genradix_init(&fa); ret = -ENOENT; task = get_proc_task(file_inode(file)); @@ -2133,71 +2428,67 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) mm = get_task_mm(task); if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + goto out_put_task; + } nr_files = 0; /* * We need two passes here: * - * 1) Collect vmas of mapped files with mmap_sem taken - * 2) Release mmap_sem and instantiate entries + * 1) Collect vmas of mapped files with mmap_lock taken + * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() - * routine might require mmap_sem taken in might_fault(). + * routine might require mmap_lock taken in might_fault(). 
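+ * (filldir() copies each entry out to user memory; that copy may fault, + * and servicing the fault takes mmap_lock again, the inversion lockdep flags)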
*/ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > ctx->pos) - nr_files++; - } + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { + p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); + if (!p) { ret = -ENOMEM; - if (fa) - flex_array_free(fa); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); goto out_put_task; } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= ctx->pos) - continue; - info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); + mmput(mm); for (i = 0; i < nr_files; i++) { - p = flex_array_get(fa, i); + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + + p = genradix_ptr(&fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, - p->name, p->len, + buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } - if (fa) - flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); out: + genradix_free(&fa); return ret; } @@ -2209,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid *pid; + struct task_struct *task; + struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) @@ -2224,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_list_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_list_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = list_entry((struct list_head *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = 
timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%p\n", - timer->sigq->info.si_signo, - timer->sigq->info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, + timer->sigq.info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -2296,7 +2579,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = inode->i_sb->s_fs_info; + tp->ns = proc_pid_ns(inode->i_sb); return 0; } @@ -2325,10 +2608,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, return -ESRCH; if (p != current) { - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); count = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { @@ -2338,10 +2624,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, } task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; + if (rt_or_dl_task_policy(p)) + slack_ns = 0; + else if (slack_ns == 0) + slack_ns = p->default_timer_slack_ns; + p->timer_slack_ns = slack_ns; task_unlock(p); out: @@ -2361,11 +2648,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v) return -ESRCH; if (p != current) { - - if (!capable(CAP_SYS_NICE)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); + err = security_task_getscheduler(p); if (err) goto out; @@ -2394,16 +2684,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = { .release = single_release, }; -static int proc_pident_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_pident_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, p->mode); + inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) @@ -2413,25 +2703,17 @@ static int proc_pident_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + pid_update_inode(task, inode); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, - const struct pid_entry *ents, - unsigned int nents) + const struct pid_entry *p, + const struct pid_entry *end) { - int error; struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -2440,21 +2722,17 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it 
does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ - last = &ents[nents]; - for (p = ents; p < last; p++) { + for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) + if (!memcmp(dentry->d_name.name, p->name, p->len)) { + res = proc_pident_instantiate(dentry, task, p); break; + } } - if (p >= last) - goto out; - - error = proc_pident_instantiate(dir, dentry, task, p); -out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, @@ -2484,6 +2762,13 @@ out: } #ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + return 0; +} + static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -2495,8 +2780,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, - (char*)file->f_path.dentry->d_name.name, + length = security_getprocattr(task, PROC_I(inode)->op.lsmid, + file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) @@ -2509,62 +2794,128 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); + struct task_struct *task; void *page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); + int rv; - length = -ESRCH; - if (!task) - goto out_no_task; + /* A task may only write when it was the opener. */ + if (file->private_data != current->mm) + return -EPERM; + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } /* A task may only write its own attributes. */ - length = -EACCES; - if (current != task) - goto out; + if (current != task) { + rcu_read_unlock(); + return -EACCES; + } + /* Prevent changes to overridden credentials. */ + if (current_cred() != current_real_cred()) { + rcu_read_unlock(); + return -EBUSY; + } + rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. 
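* (the whole attribute must arrive in a single write(2) starting at file * offset zero; any other *ppos is rejected with -EINVAL)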
*/ - length = -EINVAL; if (*ppos != 0) - goto out; + return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { - length = PTR_ERR(page); + rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&current->signal->cred_guard_mutex); - if (length < 0) + rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex); + if (rv < 0) goto out_free; - length = security_setprocattr(file->f_path.dentry->d_name.name, - page, count); + rv = security_setprocattr(PROC_I(inode)->op.lsmid, + file->f_path.dentry->d_name.name, page, + count); mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: - put_task_struct(task); -out_no_task: - return length; + return rv; } static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, + .release = mem_release, }; +#define LSM_DIR_OPS(LSM) \ +static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ + struct dir_context *ctx) \ +{ \ + return proc_pident_readdir(filp, ctx, \ + LSM##_attr_dir_stuff, \ + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct file_operations proc_##LSM##_attr_dir_ops = { \ + .read = generic_read_dir, \ + .iterate_shared = proc_##LSM##_attr_dir_iterate, \ + .llseek = default_llseek, \ +}; \ +\ +static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ + struct dentry *dentry, unsigned int flags) \ +{ \ + return proc_pident_lookup(dir, dentry, \ + LSM##_attr_dir_stuff, \ + LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ + .lookup = proc_##LSM##_attr_dir_lookup, \ + .getattr = pid_getattr, \ + .setattr = proc_setattr, \ +} + +#ifdef CONFIG_SECURITY_SMACK +static const struct pid_entry smack_attr_dir_stuff[] = { + ATTR(LSM_ID_SMACK, "current", 0666), +}; +LSM_DIR_OPS(smack); +#endif + +#ifdef CONFIG_SECURITY_APPARMOR +static const struct pid_entry apparmor_attr_dir_stuff[] = { + ATTR(LSM_ID_APPARMOR, "current", 0666), + ATTR(LSM_ID_APPARMOR, "prev", 0444), + ATTR(LSM_ID_APPARMOR, "exec", 0666), +}; +LSM_DIR_OPS(apparmor); +#endif + static const struct pid_entry attr_dir_stuff[] = { - REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("prev", S_IRUGO, proc_pid_attr_operations), - REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + ATTR(LSM_ID_UNDEF, "current", 0666), + ATTR(LSM_ID_UNDEF, "prev", 0444), + ATTR(LSM_ID_UNDEF, "exec", 0666), + ATTR(LSM_ID_UNDEF, "fscreate", 0666), + ATTR(LSM_ID_UNDEF, "keycreate", 0666), + ATTR(LSM_ID_UNDEF, "sockcreate", 0666), +#ifdef CONFIG_SECURITY_SMACK + DIR("smack", 0555, + proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), +#endif +#ifdef CONFIG_SECURITY_APPARMOR + DIR("apparmor", 0555, + proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), +#endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) @@ -2583,7 +2934,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); + attr_dir_stuff, + attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations 
proc_attr_dir_inode_operations = { @@ -2610,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -2650,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); @@ -2674,11 +3028,10 @@ static const struct file_operations proc_coredump_filter_operations = { #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { - struct task_io_accounting acct = task->ioac; - unsigned long flags; + struct task_io_accounting acct; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2687,15 +3040,21 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh goto out_unlock; } - if (whole && lock_task_sighand(task, &flags)) { - struct task_struct *t = task; + if (whole) { + struct signal_struct *sig = task->signal; + struct task_struct *t; - task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) - task_io_accounting_add(&acct, &t->ioac); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { + acct = sig->ioac; + __for_each_thread(sig, t) + task_io_accounting_add(&acct, &t->ioac); - unlock_task_sighand(task, &flags); + } + } else { + acct = task->ioac; } + seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" @@ -2714,7 +3073,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + up_read(&task->signal->exec_update_lock); return result; } @@ -2883,6 +3242,64 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_LIVEPATCH */ +#ifdef CONFIG_KSM +static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "%lu\n", mm->ksm_merging_pages); + mmput(mm); + } + + return 0; +} +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + int ret = 0; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); + seq_printf(m, "ksm_merge_any: %s\n", + mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + return ret; + } + seq_printf(m, "ksm_mergeable: %s\n", + ksm_process_mergeable(mm) ? 
"yes" : "no"); + mmap_read_unlock(mm); + mmput(mm); + } + + return 0; +} +#endif /* CONFIG_KSM */ + +#ifdef CONFIG_KSTACK_ERASE_METRICS +static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long prev_depth = THREAD_SIZE - + (task->prev_lowest_stack & (THREAD_SIZE - 1)); + unsigned long depth = THREAD_SIZE - + (task->lowest_stack & (THREAD_SIZE - 1)); + + seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", + prev_depth, depth); + return 0; +} +#endif /* CONFIG_KSTACK_ERASE_METRICS */ + /* * Thread groups */ @@ -2893,7 +3310,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), @@ -2903,12 +3320,13 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), @@ -2930,6 +3348,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -2953,10 +3372,13 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif @@ -2970,9 +3392,6 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), @@ -2986,6 +3405,19 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_KSTACK_ERASE_METRICS + ONE("stack_depth", S_IRUGO, proc_stack_depth), +#endif +#ifdef CONFIG_PROC_PID_ARCH_STATUS + 
ONE("arch_status", S_IRUGO, proc_pid_arch_status), +#endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3000,10 +3432,19 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (file->f_op != &proc_tgid_base_operations) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); + tgid_base_stuff, + tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { @@ -3013,130 +3454,64 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .permission = proc_pid_permission, }; -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) -{ - struct dentry *dentry, *leader, *dir; - char buf[PROC_NUMBUF]; - struct qstr name; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); - /* no ->d_hash() rejects on procfs */ - dentry = d_hash_and_lookup(mnt->mnt_root, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - if (pid == tgid) - return; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", tgid); - leader = d_hash_and_lookup(mnt->mnt_root, &name); - if (!leader) - goto out; - - name.name = "task"; - name.len = strlen(name.name); - dir = d_hash_and_lookup(leader, &name); - if (!dir) - goto out_put_leader; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); - dentry = d_hash_and_lookup(dir, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - dput(dir); -out_put_leader: - dput(leader); -out: - return; -} - /** - * proc_flush_task - Remove dcache entries for @task from the /proc dcache. - * @task: task that should be flushed. - * - * When flushing dentries from proc, one needs to flush them from global - * proc (proc_mnt) and from all the namespaces' procs this task was seen - * in. This call is supposed to do all of this job. + * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. * - * Looks in the dcache for - * /proc/@pid - * /proc/@tgid/task/@pid - * if either directory is present flushes it and all of it'ts children - * from the dcache. + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be - * flushed instead. This routine is proved to flush those useless - * dcache entries at process exit time. + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee - * that no dcache entries will exist at process exit time it - * just makes it very unlikely that any will persist. 
+ * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. */ -void proc_flush_task(struct task_struct *task) +void proc_flush_pid(struct pid *pid) { - int i; - struct pid *pid, *tgid; - struct upid *upid; - - pid = task_pid(task); - tgid = task_tgid(task); - - for (i = 0; i <= pid->level; i++) { - upid = &pid->numbers[i]; - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid->numbers[i].nr); - } + proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } -static int proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, +static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); + pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; unsigned tgid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) @@ -3145,10 +3520,17 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign if (!task) goto out; - result = proc_pid_instantiate(dir, dentry, task, NULL); + /* Limit procfs to only ptraceable tasks */ + if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { + if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) + goto out_put_task; + } + + result = proc_pid_instantiate(dentry, task, NULL); +out_put_task: put_task_struct(task); out: - return ERR_PTR(result); + return result; } /* @@ -3171,20 +3553,8 @@ retry: pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); - iter.task = pid_task(pid, PIDTYPE_PID); - /* What we to know is if the pid we have find is the - * pid of a thread_group_leader. Testing for task - * being a thread_group_leader is the obvious thing - * todo but there is a window when it fails, due to - * the pid transfer logic in de_thread. - * - * So we perform the straight forward test of seeing - * if the pid we have found is the pid of a thread - * group leader, and don't worry if the task we have - * found doesn't happen to be a thread group leader. - * As we don't care in the case of readdir. 
- */ - if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.task = pid_task(pid, PIDTYPE_TGID); + if (!iter.task) { iter.tgid += 1; goto retry; } @@ -3200,21 +3570,20 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); + struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(ns->proc_self); - if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(ns->proc_thread_self); - if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } @@ -3223,14 +3592,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - char name[PROC_NUMBUF]; - int len; + char name[10 + 1]; + unsigned int len; cond_resched(); - if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) + if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; - len = snprintf(name, sizeof(name), "%d", iter.tgid); + len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { @@ -3254,7 +3623,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. 
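* (so a thread can always read or update its own comm through * /proc/<tgid>/task/<tid>/comm, whatever the plain mode bits would allow)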
*/ -static int proc_tid_comm_permission(struct inode *inode, int mask) +static int proc_tid_comm_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; @@ -3273,11 +3643,12 @@ static int proc_tid_comm_permission(struct inode *inode, int mask) return 0; } - return generic_permission(inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { - .permission = proc_tid_comm_permission, + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, }; /* @@ -3285,7 +3656,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = { */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), @@ -3295,9 +3666,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), @@ -3307,12 +3676,12 @@ static const struct pid_entry tid_base_stuff[] = { REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_tid_maps_operations), + REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -3322,7 +3691,8 @@ static const struct pid_entry tid_base_stuff[] = { REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -3346,10 +3716,13 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif @@ -3360,9 +3733,6 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, 
proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), @@ -3372,6 +3742,16 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_PROC_PID_ARCH_STATUS + ONE("arch_status", S_IRUGO, proc_pid_arch_status), +#endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3383,7 +3763,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + tid_base_stuff, + tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { @@ -3398,37 +3779,33 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static int proc_task_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_task_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); - + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); + inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; - inode->i_flags|=S_IMMUTABLE; + inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); + pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; @@ -3437,7 +3814,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (tid == ~0U) goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) @@ -3448,13 +3826,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!same_thread_group(leader, task)) goto out_drop_task; - result = proc_task_instantiate(dir, dentry, task, NULL); + result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: - return ERR_PTR(result); + return result; } /* @@ -3497,11 +3875,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. 
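* (nr is derived from the readdir position, so when the cached tid is * stale the scan resumes that many entries past the group leader)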
*/ - pos = task = task->group_leader; - do { + for_each_thread(task, pos) { if (!nr--) goto found; - } while_each_thread(task, pos); + } fail: pos = NULL; goto out; @@ -3523,10 +3900,8 @@ static struct task_struct *next_tid(struct task_struct *start) struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { - pos = next_thread(start); - if (thread_group_leader(pos)) - pos = NULL; - else + pos = __next_thread(start); + if (pos) get_task_struct(pos); } rcu_read_unlock(); @@ -3548,24 +3923,27 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ - ns = inode->i_sb->s_fs_info; - tid = (int)file->f_version; - file->f_version = 0; + ns = proc_pid_ns(inode->i_sb); + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { - char name[PROC_NUMBUF]; - int len; + char name[10 + 1]; + unsigned int len; + tid = task_pid_nr_ns(task, ns); + if (!tid) + continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@ -3574,12 +3952,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(const struct path *path, struct kstat *stat, +static int proc_task_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); @@ -3589,6 +3968,24 @@ static int proc_task_getattr(const struct path *path, struct kstat *stat, return 0; } +/* + * proc_task_readdir() set @file->private_data to a positive integer + * value, so casting that to u64 is safe. generic_llseek_cookie() will + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is + * here to catch any unexpected change in behavior either in + * proc_task_readdir() or generic_llseek_cookie(). 
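+ * (a seek that rewinds the directory therefore also zeroes the saved + * cookie, and the next readdir restarts the thread walk from the top)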
+ */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@ -3599,7 +3996,7 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c new file mode 100644 index 000000000000..87dcaae32ff8 --- /dev/null +++ b/fs/proc/bootconfig.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/bootconfig - Extra boot configuration + */ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/bootconfig.h> +#include <linux/slab.h> + +static char *saved_boot_config; + +static int boot_config_proc_show(struct seq_file *m, void *v) +{ + if (saved_boot_config) + seq_puts(m, saved_boot_config); + return 0; +} + +/* Rest size of buffer */ +#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) + +/* Return the needed total length if @size is 0 */ +static int __init copy_xbc_key_value_list(char *dst, size_t size) +{ + struct xbc_node *leaf, *vnode; + char *key, *end = dst + size; + const char *val; + char q; + int ret = 0; + + key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); + if (!key) + return -ENOMEM; + + xbc_for_each_key_value(leaf, val) { + ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); + if (ret < 0) + break; + ret = snprintf(dst, rest(dst, end), "%s = ", key); + if (ret < 0) + break; + dst += ret; + vnode = xbc_node_get_child(leaf); + if (vnode) { + xbc_array_for_each_value(vnode, val) { + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, xbc_node_is_array(vnode) ? ", " : "\n"); + if (ret < 0) + goto out; + dst += ret; + } + } else { + ret = snprintf(dst, rest(dst, end), "\"\"\n"); + if (ret < 0) + break; + dst += ret; + } + } + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; + } +out: + kfree(key); + + return ret < 0 ? 
ret : dst - (end - size); +} + +static int __init proc_boot_config_init(void) +{ + int len; + + len = copy_xbc_key_value_list(NULL, 0); + if (len < 0) + return len; + + if (len > 0) { + saved_boot_config = kzalloc(len + 1, GFP_KERNEL); + if (!saved_boot_config) + return -ENOMEM; + + len = copy_xbc_key_value_list(saved_boot_config, len + 1); + if (len < 0) { + kfree(saved_boot_config); + return len; + } + } + + proc_create_single("bootconfig", 0, NULL, boot_config_proc_show); + + return 0; +} +fs_initcall(proc_boot_config_init); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index cbd82dff7e81..a6f76121955f 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -1,29 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); return 0; } -static int cmdline_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cmdline_proc_show, NULL); -} - -static const struct file_operations cmdline_proc_fops = { - .open = cmdline_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_cmdline_init(void) { - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde_make_permanent(pde); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 290ba85cb900..b7cab1ad990d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2010 Werner Fink, Jiri Slaby - * - * Licensed under GPLv2 */ #include <linux/console.h> @@ -22,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, @@ -34,7 +34,16 @@ static int show_console_dev(struct seq_file *m, void *v) if (con->device) { const struct tty_driver *driver; int index; + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); driver = con->device(con, &index); + console_unlock(); + if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; @@ -50,22 +59,27 @@ static int show_console_dev(struct seq_file *m, void *v) seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + ((con->flags & CON_NBCON) || con->write) ? 'W' : '-', + con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); - seq_printf(m, "\n"); - + seq_putc(m, '\n'); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(&console_mutex) { struct console *con; loff_t off = 0; - console_lock(); + /* + * Hold the console_list_lock to guarantee safe traversal of the + * console list. SRCU cannot be used because there is no + * place to store the SRCU cookie. 
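+ * (console_srcu_read_lock() returns a cookie that must be handed back to + * console_srcu_read_unlock(), and the seq_file iterator offers no slot to + * carry it from c_start() to c_stop())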
+ */ + console_list_lock(); for_each_console(con) if (off++ == *pos) break; @@ -76,13 +90,15 @@ static void *c_start(struct seq_file *m, loff_t *pos) static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; + ++*pos; - return con->next; + return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) + __releases(&console_mutex) { - console_unlock(); + console_list_unlock(); } static const struct seq_operations consoles_op = { @@ -92,21 +108,9 @@ static const struct seq_operations consoles_op = { .show = show_console_dev }; -static int consoles_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &consoles_op); -} - -static const struct file_operations proc_consoles_operations = { - .open = consoles_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_consoles_init(void) { - proc_create("consoles", 0, NULL, &proc_consoles_operations); + proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 06f4d31e0396..f38bda5b83ec 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,24 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpufreq.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> extern const struct seq_operations cpuinfo_op; + static int cpuinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &cpuinfo_op); } -static const struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = cpuinfo_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init proc_cpuinfo_init(void) { - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); return 0; } fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 50493edc30e5..fe7bfcb7d049 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -1,20 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; - if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i < CHRDEV_MAJOR_MAX) { if (i == 0) seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { - i -= CHRDEV_MAJOR_HASH_SIZE; + i -= CHRDEV_MAJOR_MAX; if (i == 0) seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); @@ -25,7 +28,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +36,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos) static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return NULL; return pos; } @@ -50,21 +53,12 @@ static const struct seq_operations devinfo_ops = { .show = devinfo_show }; -static int 
devinfo_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &devinfo_ops); -} - -static const struct file_operations proc_devinfo_operations = { - .open = devinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_devices_init(void) { - proc_create("devices", 0, NULL, &proc_devinfo_operations); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index c330495c3115..9eeccff49b2a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/dcache.h> @@ -5,10 +6,13 @@ #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/pid.h> +#include <linux/ptrace.h> +#include <linux/bitmap.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> @@ -27,35 +31,35 @@ static int seq_show(struct seq_file *m, void *v) if (!task) return -ENOENT; - files = get_files_struct(task); - put_task_struct(task); - + task_lock(task); + files = task->files; if (files) { unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); - file = fcheck_files(files, fd); + file = files_lookup_fd_locked(files, fd); if (file) { - struct fdtable *fdt = files_fdtable(files); - f_flags = file->f_flags; - if (close_on_exec(fd, fdt)) + if (close_on_exec(fd, files)) f_flags |= O_CLOEXEC; get_file(file); ret = 0; } spin_unlock(&files->file_lock); - put_files_struct(files); } + task_unlock(task); + put_task_struct(task); if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id); + real_mount(file->f_path.mnt)->mnt_id, + file_inode(file)->i_ino); + /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; @@ -73,6 +77,34 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) return single_open(file, seq_show, inode); } +/* + * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure + * that the current task has PTRACE_MODE_READ in addition to the normal + * POSIX-like checks. 
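For orientation, the record that the PTRACE_MODE_READ check introduced here guards is seq_show()'s output above: one "key:<TAB>value" line each for pos, flags (octal, hence the leading 0), mnt_id and the newly added ino, followed by any lock lines. A minimal userspace sketch, not part of the patch, that parses the header fields of stdin's fdinfo:

#include <stdio.h>

int main(void)
{
        char key[16];
        long long val;
        FILE *f = fopen("/proc/self/fdinfo/0", "r");

        if (!f) {
                /* another task's fdinfo fails here with EACCES
                 * unless the caller is ptrace-capable over it */
                perror("fdinfo");
                return 1;
        }
        /* "%lli" accepts the octal flags value via its leading 0 */
        while (fscanf(f, "%15[^:]:%lli ", key, &val) == 2)
                printf("%s = %lld\n", key, val);
        fclose(f);
        return 0;
}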
+ */ +static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) +{ + bool allowed = false; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + put_task_struct(task); + + if (!allowed) + return -EACCES; + + return generic_permission(idmap, inode, mask); +} + +static const struct inode_operations proc_fdinfo_file_inode_operations = { + .permission = proc_fdinfo_permission, + .setattr = proc_setattr, +}; + static const struct file_operations proc_fdinfo_file_operations = { .open = seq_fdinfo_open, .read = seq_read, @@ -80,9 +112,37 @@ static const struct file_operations proc_fdinfo_file_operations = { .release = single_release, }; -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) +{ + struct file *file; + + file = fget_task(task, fd); + if (file) { + *mode = file->f_mode; + fput(file); + } + return !!file; +} + +static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, + fmode_t f_mode) +{ + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + security_task_to_inode(task, inode); +} + +static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct files_struct *files; struct task_struct *task; struct inode *inode; unsigned int fd; @@ -95,35 +155,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) fd = proc_fd(inode); if (task) { - files = get_files_struct(task); - if (files) { - struct file *file; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - unsigned f_mode = file->f_mode; - - rcu_read_unlock(); - put_files_struct(files); - - task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); - - if (S_ISLNK(inode->i_mode)) { - unsigned i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; - } - - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } - rcu_read_unlock(); - put_files_struct(files); + fmode_t f_mode; + if (tid_fd_mode(task, fd, &f_mode)) { + tid_fd_update_inode(task, inode, f_mode); + put_task_struct(task); + return 1; } put_task_struct(task); } @@ -137,62 +173,54 @@ static const struct dentry_operations tid_fd_dentry_operations = { static int proc_fd_link(struct dentry *dentry, struct path *path) { - struct files_struct *files = NULL; struct task_struct *task; int ret = -ENOENT; task = get_proc_task(d_inode(dentry)); if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - - if (files) { unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; - spin_lock(&files->file_lock); - fd_file = fcheck_files(files, fd); + fd_file = fget_task(task, fd); if (fd_file) { *path = fd_file->f_path; path_get(&fd_file->f_path); ret = 0; + fput(fd_file); } - spin_unlock(&files->file_lock); - put_files_struct(files); + put_task_struct(task); } return ret; } -static int -proc_fd_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +struct fd_data { + fmode_t mode; + unsigned fd; +}; + +static struct dentry 
*proc_fd_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; + tid_fd_update_inode(task, inode, data->mode); - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -200,26 +228,27 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); - int result = -ENOENT; - unsigned fd = name_to_int(&dentry->d_name); + struct fd_data data = {.fd = name_to_int(&dentry->d_name)}; + struct dentry *result = ERR_PTR(-ENOENT); if (!task) goto out_no_task; - if (fd == ~0U) + if (data.fd == ~0U) + goto out; + if (!tid_fd_mode(task, data.fd, &data.mode)) goto out; - result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); + result = instantiate(dentry, task, &data); out: put_task_struct(task); out_no_task: - return ERR_PTR(result); + return result; } static int proc_readfd_common(struct file *file, struct dir_context *ctx, instantiate_t instantiate) { struct task_struct *p = get_proc_task(file_inode(file)); - struct files_struct *files; unsigned int fd; if (!p) @@ -227,45 +256,65 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = ctx->pos - 2; - fd < files_fdtable(files)->max_fds; - fd++, ctx->pos++) { - char name[PROC_NUMBUF]; - int len; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); + for (fd = ctx->pos - 2;; fd++) { + struct file *f; + struct fd_data data; + char name[10 + 1]; + unsigned int len; + + f = fget_task_next(p, &fd); + ctx->pos = fd + 2LL; + if (!f) + break; + data.mode = f->f_mode; + fput(f); + data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, - (void *)(unsigned long)fd)) - goto out_fd_loop; + &data)) + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); out: put_task_struct(p); return 0; } -static int proc_readfd(struct file *file, struct dir_context *ctx) +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int proc_fd_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfd, + .iterate_shared = proc_fd_iterate, .llseek = 
generic_file_llseek, }; @@ -279,12 +328,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -int proc_fd_permission(struct inode *inode, int mask) +int proc_fd_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(inode, mask); + rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; @@ -297,37 +347,44 @@ int proc_fd_permission(struct inode *inode, int mask) return rv; } +static int proc_fd_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return proc_readfd_count(inode, &stat->size); +} + const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; -static int -proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; - inode->i_fop = &proc_fdinfo_file_operations; + inode->i_op = &proc_fdinfo_file_inode_operations; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); + inode->i_fop = &proc_fdinfo_file_operations; + tid_fd_update_inode(task, inode, 0); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry * @@ -336,7 +393,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); @@ -344,11 +401,12 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, + .permission = proc_fdinfo_permission, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfdinfo, + .iterate_shared = proc_fdinfo_iterate, .llseek = generic_file_llseek, }; diff --git a/fs/proc/fd.h b/fs/proc/fd.h index 46dafadd0083..7e7265f7e06f 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PROCFS_FD_H__ #define __PROCFS_FD_H__ @@ -9,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct inode *inode, int 
mask); +extern int proc_fd_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e3cda0b5968f..501889856461 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * proc/fs/generic.c --- generic routines for the proc-fs * @@ -8,12 +9,14 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/namei.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/mount.h> @@ -23,12 +26,24 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> #include "internal.h" static DEFINE_RWLOCK(proc_subdir_lock); -static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; @@ -60,7 +75,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); - int result = proc_match(len, name, de); + int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; @@ -83,7 +98,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); - int result = proc_match(de->namelen, de->name, this); + int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) @@ -100,33 +115,38 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +static int proc_notify_change(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - setattr_copy(inode, iattr); - mark_inode_dirty(inode); + setattr_copy(&nop_mnt_idmap, inode, iattr); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } -static int proc_getattr(const struct path *path, struct kstat *stat, +static int proc_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); - if (de && de->nlink) - set_nlink(inode, de->nlink); + if (de) { + nlink_t nlink = READ_ONCE(de->nlink); + if (nlink > 0) { + set_nlink(inode, nlink); + } + } - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -144,24 +164,15 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, { const char *cp = name, *next; struct proc_dir_entry *de; - unsigned int len; - de = *ret; - if (!de) - de = &proc_root; - - while (1) { - next = strchr(cp, '/'); - if (!next) - break; - - len = next - cp; - 
de = pde_subdir_find(de, cp, len); + de = *ret ?: &proc_root; + while ((next = strchr(cp, '/')) != NULL) { + de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; } - cp += len + 1; + cp = next + 1; } *residual = cp; *ret = de; @@ -191,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum) { int i; - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, - GFP_KERNEL); + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, + GFP_KERNEL); if (i < 0) return i; @@ -202,15 +213,36 @@ int proc_alloc_inum(unsigned int *inum) void proc_free_inum(unsigned int inum) { - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); +} + +static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) + return 0; /* revalidate */ + return 1; +} + +static int proc_misc_d_delete(const struct dentry *dentry) +{ + return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; } +static const struct dentry_operations proc_misc_dentry_ops = { + .d_revalidate = proc_misc_d_revalidate, + .d_delete = proc_misc_d_delete, +}; + /* * Don't create negative dentries here, return -ENOENT by hand * instead. */ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) { struct inode *inode; @@ -222,9 +254,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); - d_add(dentry, inode); - return NULL; + if (de->flags & PROC_ENTRY_FORCE_LOOKUP) + return d_splice_alias_ops(inode, dentry, + &proc_net_dentry_ops); + return d_splice_alias_ops(inode, dentry, + &proc_misc_dentry_ops); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -233,7 +267,12 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - return proc_lookup_de(PDE(dir), dir, dentry); + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return ERR_PTR(-ENOENT); + + return proc_lookup_de(dir, dentry, PDE(dir)); } /* @@ -245,17 +284,17 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. 
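proc_lookup() now fails every non-PID lookup when the instance was mounted with subset=pid (PROC_PIDONLY_ON), and proc_readdir() below short-circuits the same way, so only /proc/<pid> directories remain visible. A hedged userspace illustration — it needs CAP_SYS_ADMIN, and the /tmp/proc target directory is an assumption of the example, not something in the patch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Each procfs mount is its own instance with per-superblock
         * proc_fs_info, so this does not affect /proc itself. */
        if (mount("proc", "/tmp/proc", "proc", 0, "subset=pid")) {
                perror("mount");
                return 1;
        }
        puts("procfs instance restricted to PID directories");
        return 0;
}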
*/ -int proc_readdir_de(struct proc_dir_entry *de, struct file *file, - struct dir_context *ctx) +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) { int i; if (!dir_emit_dots(file, ctx)) return 0; + i = ctx->pos - 2; read_lock(&proc_subdir_lock); de = pde_subdir_first(de); - i = ctx->pos - 2; for (;;) { if (!de) { read_unlock(&proc_subdir_lock); @@ -276,8 +315,8 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, pde_put(de); return 0; } - read_lock(&proc_subdir_lock); ctx->pos++; + read_lock(&proc_subdir_lock); next = pde_subdir_next(de); pde_put(de); de = next; @@ -289,8 +328,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return 1; - return proc_readdir_de(PDE(inode), file, ctx); + return proc_readdir_de(file, ctx, PDE(inode)); } /* @@ -304,6 +347,17 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; +static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + /* * proc directories can do almost nothing.. */ @@ -313,13 +367,34 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; -static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +static void pde_set_flags(struct proc_dir_entry *pde) { - int ret; + const struct proc_ops *proc_ops = pde->proc_ops; - ret = proc_alloc_inum(&dp->low_ino); - if (ret) - return ret; + if (!proc_ops) + return; + + if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; + if (proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif + if (proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; +} + +/* returns the registered entry, or frees dp and returns NULL on failure */ +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp) +{ + if (proc_alloc_inum(&dp->low_ino)) + goto out_free_entry; + + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; @@ -327,12 +402,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); - proc_free_inum(dp->low_ino); - return -EEXIST; + goto out_free_inum; } + dir->nlink++; write_unlock(&proc_subdir_lock); - return 0; + return dp; +out_free_inum: + proc_free_inum(dp->low_ino); +out_free_entry: + pde_free(dp); + return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, @@ -352,6 +432,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' 
&& fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; @@ -361,20 +449,34 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; ent->subdir = RB_ROOT; - atomic_set(&ent->count, 1); + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP) + pde_force_lookup(ent); + out: return ent; } @@ -388,17 +490,13 @@ struct proc_dir_entry *proc_symlink(const char *name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { - ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + ent->size = strlen(dest); + ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { - strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; - if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); - ent = NULL; - } + ent = proc_register(parent, ent); } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -406,8 +504,8 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, void *data) +struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data, bool force_lookup) { struct proc_dir_entry *ent; @@ -417,17 +515,22 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; - ent->proc_fops = &proc_dir_operations; + ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; - parent->nlink++; - if (proc_register(parent, ent) < 0) { - kfree(ent); - parent->nlink--; - ent = NULL; + if (force_lookup) { + pde_force_lookup(ent); } + ent = proc_register(parent, ent); } return ent; } +EXPORT_SYMBOL_GPL(_proc_mkdir); + +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) +{ + return _proc_mkdir(name, mode, parent, data, false); +} EXPORT_SYMBOL_GPL(proc_mkdir_data); struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, @@ -452,53 +555,128 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; - ent->proc_fops = NULL; + ent->proc_dir_ops = NULL; ent->proc_iops = NULL; - parent->nlink++; - if (proc_register(parent, ent) < 0) { - kfree(ent); - parent->nlink--; - ent = NULL; - } + ent = proc_register(parent, ent); } return ent; } EXPORT_SYMBOL(proc_create_mount_point); -struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops, - void *data) +struct proc_dir_entry *proc_create_reg(const char *name, 
umode_t mode, + struct proc_dir_entry **parent, void *data) { - struct proc_dir_entry *pde; + struct proc_dir_entry *p; + if ((mode & S_IFMT) == 0) mode |= S_IFREG; - - if (!S_ISREG(mode)) { - WARN_ON(1); /* use proc_mkdir() */ + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + + p = __proc_create(parent, name, mode, 1); + if (p) { + p->proc_iops = &proc_file_inode_operations; + p->data = data; } + return p; +} - BUG_ON(proc_fops == NULL); +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct proc_ops *proc_ops, void *data) +{ + struct proc_dir_entry *p; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - pde = __proc_create(&parent, name, mode, 1); - if (!pde) - goto out; - pde->proc_fops = proc_fops; - pde->data = data; - pde->proc_iops = &proc_file_inode_operations; - if (proc_register(parent, pde) < 0) - goto out_free; - return pde; -out_free: - kfree(pde); -out: - return NULL; + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = proc_ops; + return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct proc_ops *proc_ops) +{ + return proc_create_data(name, mode, parent, proc_ops, NULL); +} +EXPORT_SYMBOL(proc_create); + +static int proc_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_open_private(file, de->seq_ops, de->state_size); + return seq_open(file, de->seq_ops); +} + +static int proc_seq_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_release_private(inode, file); + return seq_release(inode, file); +} + +static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ + .proc_open = proc_seq_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = proc_seq_release, +}; + +struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = &proc_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_seq_private); + +static int proc_single_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + return single_open(file, de->single_show, de->data); +} + +static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ + .proc_open = proc_single_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = &proc_single_ops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_single_data); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; @@ -512,19 +690,18 @@ void proc_set_user(struct 
proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) +void pde_put(struct proc_dir_entry *pde) { - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); + if (refcount_dec_and_test(&pde->refcnt)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } -void pde_put(struct proc_dir_entry *pde) +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); } /* @@ -544,8 +721,16 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) len = strlen(fn); de = pde_subdir_find(parent, fn, len); - if (de) - rb_erase(&de->subdir_node, &parent->subdir); + if (de) { + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + pde_erase(de, parent); + if (S_ISDIR(de->mode)) + parent->nlink--; + } + } write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -554,9 +739,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) proc_entry_rundown(de); - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; WARN(pde_subdir_first(de), "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, pde_subdir_first(de)->name); @@ -582,23 +764,34 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase(&root->subdir_node, &parent->subdir); + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } + pde_erase(root, parent); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase(&next->subdir_node, &de->subdir); + if (unlikely(pde_is_permanent(next))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } + pde_erase(next, de); de = next; continue; } - write_unlock(&proc_subdir_lock); - - proc_entry_rundown(de); next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; - de->nlink = 0; + write_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); if (de == root) break; pde_put(de); @@ -625,8 +818,26 @@ void proc_remove(struct proc_dir_entry *de) } EXPORT_SYMBOL(proc_remove); -void *PDE_DATA(const struct inode *inode) +/* + * Pull a user buffer into memory and pass it to the file's write handler if + * one is supplied. The ->write() method is permitted to modify the + * kernel-side buffer. + */ +ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, + loff_t *_pos) { - return __PDE_DATA(inode); + struct proc_dir_entry *pde = PDE(file_inode(f)); + char *buf; + int ret; + + if (!pde->write) + return -EACCES; + if (size == 0 || size > PAGE_SIZE - 1) + return -EINVAL; + buf = memdup_user_nul(ubuf, size); + if (IS_ERR(buf)) + return PTR_ERR(buf); + ret = pde->write(f, buf, size); + kfree(buf); + return ret == 0 ? 
size : ret; } -EXPORT_SYMBOL(PDE_DATA); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e250910cffc8..b7634f975d98 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> @@ -22,43 +24,37 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> -#include <linux/magic.h> - -#include <linux/uaccess.h> +#include <linux/bug.h> #include "internal.h" static void proc_evict_inode(struct inode *inode) { - struct proc_dir_entry *de; struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ - put_pid(PROC_I(inode)->pid); + if (ei->pid) + proc_pid_evict_inode(ei); - /* Let go of any associated proc directory entry */ - de = PDE(inode); - if (de) - pde_put(de); - - head = PROC_I(inode)->sysctl; + head = ei->sysctl; if (head) { - RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } -static struct kmem_cache * proc_inode_cachep; +static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; - struct inode *inode; - ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; @@ -67,20 +63,21 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; - inode = &ei->vfs_inode; - return inode; + return &ei->vfs_inode; } -static void proc_i_callback(struct rcu_head *head) +static void proc_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(proc_inode_cachep, PROC_I(inode)); -} + struct proc_inode *ei = PROC_I(inode); -static void proc_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, proc_i_callback); + if (ei->pid) + put_pid(ei->pid); + /* Let go of any associated proc directory entry */ + if (ei->pde) + pde_put(ei->pde); + kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) @@ -90,36 +87,109 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + SIZEOF_PDE_INLINE_NAME, NULL); + BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); +} + +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct hlist_node *node; + struct super_block *old_sb = NULL; + + rcu_read_lock(); + while ((node = hlist_first_rcu(inodes))) { + struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); + struct super_block *sb; + struct inode 
*inode; + + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } + if (unlikely(!inode)) { + rcu_read_lock(); + continue; + } + + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } + iput(inode); + + rcu_read_lock(); + } + rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); +} + +static inline const char *hidepid2str(enum proc_hidepid v) +{ + switch (v) { + case HIDEPID_OFF: return "off"; + case HIDEPID_NO_ACCESS: return "noaccess"; + case HIDEPID_INVISIBLE: return "invisible"; + case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; + } + WARN_ONCE(1, "bad hide_pid value: %d\n", v); + return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = root->d_sb; - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); - if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) - seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); - if (pid->hide_pid != HIDEPID_OFF) - seq_printf(seq, ",hidepid=%u", pid->hide_pid); + if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); + if (fs_info->hide_pid != HIDEPID_OFF) + seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); + if (fs_info->pidonly != PROC_PIDONLY_OFF) + seq_printf(seq, ",subset=pid"); return 0; } -static const struct super_operations proc_sops = { +const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, - .destroy_inode = proc_destroy_inode, - .drop_inode = generic_delete_inode, + .free_inode = proc_free_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, .show_options = proc_show_options, }; @@ -127,17 +197,26 @@ enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { - return atomic_inc_unless_negative(&pde->in_use); + return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { - if (atomic_dec_return(&pde->in_use) == BIAS) + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } -/* pde is locked */ +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. + */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: @@ -145,9 +224,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. 
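For context on use_pde()/unuse_pde() above: in_use counts callers into the entry, and removal adds BIAS (INT_MIN) so the count turns negative, which makes new use_pde() calls fail while the remover waits for in-flight users to drain back to exactly BIAS. A self-contained userspace sketch of the same gate using C11 atomics; all names below are mine, not the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BIAS ((int)(-1U << 31))         /* INT_MIN, as in the enum above */

static atomic_int in_use;

static bool use_entry(void)             /* cf. atomic_inc_unless_negative() */
{
        int v = atomic_load(&in_use);

        while (v >= 0)
                if (atomic_compare_exchange_weak(&in_use, &v, v + 1))
                        return true;
        return false;                   /* entry is going away, refuse */
}

static void unuse_entry(void)           /* cf. atomic_dec_return() == BIAS */
{
        if (atomic_fetch_sub(&in_use, 1) - 1 == BIAS)
                puts("last user left, remover may proceed");
}

int main(void)
{
        if (use_entry())
                unuse_entry();
        atomic_fetch_add(&in_use, BIAS);        /* removal closes the gate */
        printf("use after removal started: %d\n", use_entry());
        return 0;
}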
- * - * Therefore, first process to enter this function does ->release() and - * signals its completion to the other process which does nothing. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ @@ -155,19 +231,24 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; - pde->proc_fops->release(file_inode(file), file); + pde->proc_ops->proc_release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); - /* After ->release. */ + /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); - if (pdeo->c) - complete(pdeo->c); - kfree(pdeo); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -186,6 +267,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -194,124 +276,191 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - loff_t (*llseek)(struct file *, loff_t, int); - llseek = pde->proc_fops->llseek; - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); + + if (pde_is_permanent(pde)) { + return pde->proc_ops->proc_lseek(file, offset, whence); + } else if (use_pde(pde)) { + rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); + ssize_t ret; + + if (pde_is_permanent(pde)) + return pde->proc_ops->proc_read_iter(iocb, iter); + + if (!use_pde(pde)) + return -EIO; + ret = pde->proc_ops->proc_read_iter(iocb, iter); + unuse_pde(pde); + return ret; +} + +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + const auto read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - read = pde->proc_fops->read; - if (read) - rv = read(file, buf, count, ppos); + + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + const auto write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - write = pde->proc_fops->write; - if (write) - rv = write(file, buf, count, ppos); + + if 
(pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } -static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + const auto poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + +static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned int rv = DEFAULT_POLLMASK; - unsigned int (*poll)(struct file *, struct poll_table_struct *); - if (use_pde(pde)) { - poll = pde->proc_fops->poll; - if (poll) - rv = poll(file, pts); + __poll_t rv = DEFAULT_POLLMASK; + + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + const auto ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*ioctl)(struct file *, unsigned int, unsigned long); - if (use_pde(pde)) { - ioctl = pde->proc_fops->unlocked_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*compat_ioctl)(struct file *, unsigned int, unsigned long); - if (use_pde(pde)) { - compat_ioctl = pde->proc_fops->compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + const auto mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - int (*mmap)(struct file *, struct vm_area_struct *); - if (use_pde(pde)) { - mmap = pde->proc_fops->mmap; - if (mmap) - rv = mmap(file, vma); + + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = 
-EIO; - - if (use_pde(pde)) { - typeof(proc_reg_get_unmapped_area) *get_area; + if (pde->proc_ops->proc_get_unmapped_area) + return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); - get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; + return orig_addr; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; @@ -321,10 +470,19 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - int (*open)(struct inode *, struct file *); - int (*release)(struct inode *, struct file *); + typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; + if (!pde_has_proc_lseek(pde)) + file->f_mode &= ~FMODE_LSEEK; + + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -336,31 +494,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; + + const auto release = pde->proc_ops->proc_release; + if (release) { + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } } - open = pde->proc_fops->open; - release = pde->proc_fops->release; + open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. 
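None of this open/release bookkeeping is visible to users of the API, who only supply callbacks. A minimal module sketch — assuming a kernel build environment; the "demo" entry name is made up — of the proc_create_single() path, whose single_release() hook is exactly what the pde_opener tracking above guarantees will run:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        seq_puts(m, "hello from procfs\n");
        return 0;
}

static int __init demo_init(void)
{
        /* Wires demo_show() through proc_single_ops, so proc_reg_open()
         * ends up in single_open() and registers a pde_opener for the
         * matching single_release(). */
        if (!proc_create_single("demo", 0444, NULL, demo_show))
                return -ENOMEM;
        return 0;
}

static void __exit demo_exit(void)
{
        remove_proc_entry("demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("procfs single_show example");
MODULE_LICENSE("GPL");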
*/ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kmem_cache_free(pde_opener_cache, pdeo); + } +out_unuse: unuse_pde(pde); return rv; } @@ -369,11 +532,19 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + const auto release = pde->proc_ops->proc_release; + if (release) + return release(inode, file); + return 0; + } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); @@ -386,9 +557,19 @@ static const struct file_operations proc_reg_file_ops = { .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = proc_reg_compat_ioctl, -#endif + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .write = proc_reg_write, + .splice_read = copy_splice_read, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, @@ -396,12 +577,27 @@ static const struct file_operations proc_reg_file_ops = { }; #ifdef CONFIG_COMPAT -static const struct file_operations proc_reg_file_ops_no_compat = { +static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops_compat = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .splice_read = copy_splice_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, @@ -419,7 +615,7 @@ static const char *proc_get_link(struct dentry *dentry, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); - if (unlikely(!use_pde(pde))) + if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; @@ -431,87 +627,54 @@ const struct inode_operations proc_link_inode_operations = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = new_inode_pseudo(sb); - - if (inode) { - inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - PROC_I(inode)->pde = de; - - if (is_empty_pde(de)) { - make_empty_dir_inode(inode); - return inode; - } - if (de->mode) { - inode->i_mode = de->mode; - inode->i_uid = de->uid; - inode->i_gid = de->gid; - } - if (de->size) - inode->i_size = de->size; - if (de->nlink) - set_nlink(inode, de->nlink); - WARN_ON(!de->proc_iops); - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { -#ifdef CONFIG_COMPAT - if 
(!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else -#endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; - } - } - } else - pde_put(de); - return inode; -} - -int proc_fill_super(struct super_block *s, void *data, int silent) -{ - struct pid_namespace *ns = get_pid_ns(s->s_fs_info); - struct inode *root_inode; - int ret; + struct inode *inode = new_inode(sb); - if (!proc_parse_options(data, ns)) - return -EINVAL; - - /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = PROC_SUPER_MAGIC; - s->s_op = &proc_sops; - s->s_time_gran = 1; + if (!inode) { + pde_put(de); + return NULL; + } - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) { - pr_err("proc_fill_super: get root inode failed\n"); - return -ENOMEM; + inode->i_private = de->data; + inode->i_ino = de->low_ino; + simple_inode_init_ts(inode); + PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; } - s->s_root = d_make_root(root_inode); - if (!s->s_root) { - pr_err("proc_fill_super: allocate dentry failed\n"); - return -ENOMEM; + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + set_nlink(inode, de->nlink); - ret = proc_setup_self(s); - if (ret) { - return ret; + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + if (pde_has_proc_read_iter(de)) + inode->i_fop = &proc_iter_file_ops; + else + inode->i_fop = &proc_reg_file_ops; +#ifdef CONFIG_COMPAT + if (pde_has_proc_compat_ioctl(de)) { + if (pde_has_proc_read_iter(de)) + inode->i_fop = &proc_iter_file_ops_compat; + else + inode->i_fop = &proc_reg_file_ops_compat; + } +#endif + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else { + BUG(); } - return proc_setup_thread_self(s); + return inode; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2b89071630..c1e8eb984da8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -1,21 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> +#include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> +#include <linux/mm.h> struct ctl_table_header; struct mempolicy; @@ -31,33 +29,89 @@ struct mempolicy; * subdir_node is used to build the rb tree "subdir" of the parent. 
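The structure that follows stores short entry names in the tail padding that SIZEOF_PDE rounds up to a slab size class, so the common case needs no second allocation. A userspace sketch of the same layout trick; the 64-byte class and the field names are illustrative, not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
        char *name;             /* -> inline_name or a separate allocation */
        unsigned char namelen;
        char inline_name[];     /* uses the slack up to the size class */
};

#define SIZEOF_ENTRY 64         /* stand-in for SIZEOF_PDE's 128/192/256/512 */
#define INLINE_NAME_LEN (SIZEOF_ENTRY - sizeof(struct entry))

static struct entry *entry_new(const char *name)
{
        size_t len = strlen(name);
        struct entry *e = calloc(1, SIZEOF_ENTRY);

        if (!e)
                return NULL;
        if (len + 1 <= INLINE_NAME_LEN)
                e->name = e->inline_name;       /* common, short names */
        else
                e->name = malloc(len + 1);      /* rare, long names */
        if (!e->name) {
                free(e);
                return NULL;
        }
        memcpy(e->name, name, len + 1);
        e->namelen = (unsigned char)len;
        return e;
}

int main(void)
{
        struct entry *e = entry_new("cpuinfo");

        if (!e)
                return 1;
        printf("\"%s\" stored %s (inline capacity %zu)\n", e->name,
               e->name == e->inline_name ? "inline" : "out of line",
               INLINE_NAME_LEN);
        return 0;
}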
*/ struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + refcount_t refcnt; + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + union { + const struct proc_ops *proc_ops; + const struct file_operations *proc_dir_ops; + }; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; + proc_write_t write; + void *data; + unsigned int state_size; unsigned int low_ino; - umode_t mode; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; - const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; - void *data; - atomic_t count; /* use count */ - atomic_t in_use; /* number of callers into module in progress; */ - /* negative -> it's going away RSN */ - struct completion *pde_unload_completion; - struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + char *name; + umode_t mode; + u8 flags; u8 namelen; - char name[]; + char inline_name[]; } __randomize_layout; +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) + +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + +static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_read_iter; +} + +static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) +{ +#ifdef CONFIG_COMPAT + return pde->flags & PROC_ENTRY_proc_compat_ioctl; +#else + return false; +#endif +} + +static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_lseek; +} + +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); + int lsmid; }; struct proc_inode { @@ -66,8 +120,8 @@ struct proc_inode { union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; - struct ctl_table *sysctl_entry; - struct hlist_node sysctl_inodes; + const struct ctl_table *sysctl_entry; + struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; @@ -85,46 +139,20 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } -static inline void *__PDE_DATA(const struct inode *inode) -{ - return PDE(inode)->data; -} - -static inline struct pid *proc_pid(struct inode *inode) +static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } -static inline struct task_struct *get_proc_task(struct inode *inode) +static inline struct task_struct *get_proc_task(const struct inode *inode) { 
return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ @@ -133,11 +161,87 @@ out: /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 +#ifdef CONFIG_PAGE_MAPCOUNT +/** + * folio_precise_page_mapcount() - Number of mappings of this folio page. + * @folio: The folio. + * @page: The page. + * + * The number of present user page table entries that reference this page + * as tracked via the RMAP: either referenced directly (PTE) or as part of + * a larger area that covers this page (e.g., PMD). + * + * Use this function only for the calculation of existing statistics + * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). + * + * Do not add new users. + * + * Returns: The number of mappings of this folio page. 0 for + * folios that are not mapped to user space or are not tracked via the RMAP + * (e.g., shared zeropage). + */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + int mapcount = atomic_read(&page->_mapcount) + 1; + + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + if (folio_test_large(folio)) + mapcount += folio_entire_mapcount(folio); + + return mapcount; +} +#else /* !CONFIG_PAGE_MAPCOUNT */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + BUILD_BUG(); +} +#endif /* CONFIG_PAGE_MAPCOUNT */ + +/** + * folio_average_page_mapcount() - Average number of mappings per page in this + * folio + * @folio: The folio. + * + * The average number of user page table entries that reference each page in + * this folio as tracked via the RMAP: either referenced directly (PTE) or + * as part of a larger area that covers this page (e.g., PMD). + * + * The average is calculated by rounding to the nearest integer; however, + * to avoid duplicated code in current callers, the average is at least + * 1 if any page of the folio is mapped. + * + * Returns: The average number of mappings per page in this folio. + */ +static inline int folio_average_page_mapcount(struct folio *folio) +{ + int mapcount, entire_mapcount, avg; + + if (!folio_test_large(folio)) + return atomic_read(&folio->_mapcount) + 1; + + mapcount = folio_large_mapcount(folio); + if (unlikely(mapcount <= 0)) + return 0; + entire_mapcount = folio_entire_mapcount(folio); + if (mapcount <= entire_mapcount) + return entire_mapcount; + mapcount -= entire_mapcount; + + /* Round to closest integer ... */ + avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio); + /* ... but return at least 1. 
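+ * e.g. an order-2 folio (4 pages) with large mapcount 6 and no entire mappings: avg = (6 + 4/2) >> 2 = 2.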
*/ + return max_t(int, avg + entire_mapcount, 1); +} /* * array.c */ extern const struct file_operations proc_tid_children_operations; +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, @@ -151,34 +255,39 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct dentry *, struct iattr *); +extern int pid_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern int proc_setattr(struct mnt_idmap *, struct dentry *, + struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); -extern int pid_revalidate(struct dentry *, unsigned int); +extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); -extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ -typedef int instantiate_t(struct inode *, struct dentry *, +typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); -extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); -extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, - struct dentry *); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); -static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +static inline void pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); - return pde; + refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); @@ -186,24 +295,25 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } +extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { - struct file *file; struct list_head lh; + struct file *file; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; - extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; -extern void proc_init_inodecache(void); +void proc_init_kmemcache(void); +void 
proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -261,21 +371,29 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; -extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); -extern int proc_remount(struct super_block *, int *, char *); +extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c */ +struct mem_size_stats; + +struct proc_maps_locking_ctx { + struct mm_struct *mm; +#ifdef CONFIG_PER_VMA_LOCK + bool mmap_locked; + struct vm_area_struct *locked_vma; +#endif +}; + struct proc_maps_private { struct inode *inode; struct task_struct *task; - struct mm_struct *mm; -#ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; -#endif + struct vma_iterator iter; + loff_t last_pos; + struct proc_maps_locking_ctx lock_ctx; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif @@ -284,11 +402,9 @@ struct proc_maps_private { struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; -extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; @@ -297,3 +413,22 @@ extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->flags |= PROC_ENTRY_FORCE_LOOKUP; +} + +/* + * Add a new procfs dentry that can't serve as a mountpoint. That should + * encompass anything that is ephemeral and can just disappear while the + * process is still around. + */ +static inline struct dentry *proc_splice_unmountable(struct inode *inode, + struct dentry *dentry, const struct dentry_operations *d_ops) +{ + dont_mount(dentry); + return d_splice_alias_ops(inode, dentry, d_ops); +} diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index a352d5703b41..714a22ded8a8 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> @@ -10,13 +11,13 @@ */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { - return (*pos <= nr_irqs) ? pos : NULL; + return *pos <= irq_get_nr_irqs() ? 
pos : NULL; } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos > nr_irqs) + if (*pos > irq_get_nr_irqs()) return NULL; return pos; } @@ -33,21 +34,9 @@ static const struct seq_operations int_seq_ops = { .show = show_interrupts }; -static int interrupts_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &int_seq_ops); -} - -static const struct file_operations proc_interrupts_operations = { - .open = interrupts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_interrupts_init(void) { - proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + proc_create_seq("interrupts", 0, NULL, &int_seq_ops); return 0; } fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 45629f4b5402..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/proc/kcore.c kernel ELF core dumper * @@ -9,6 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> */ +#include <linux/vmcore_info.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/kcore.h> @@ -16,24 +18,22 @@ #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> #include <linux/memory.h> #include <linux/sched/task.h> +#include <linux/security.h> #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -48,104 +48,100 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -/* An ELF note in memory */ -struct memelfnote +#ifndef kc_xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr +static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys) { - const char *name; - int type; - unsigned int datasz; - void *data; -}; + return __va(phys); +} +#endif +#ifndef kc_unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr +static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt) +{ +} +#endif static LIST_HEAD(kclist_head); -static DEFINE_RWLOCK(kclist_lock); +static int kcore_nphdr; +static size_t kcore_phdrs_len; +static size_t kcore_notes_len; +static size_t kcore_data_offset; +DEFINE_STATIC_PERCPU_RWSEM(kclist_lock); static int kcore_need_update = 1; -void -kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + +/* This doesn't grab kclist_lock, so it should only be used at init time. 
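+ * Boot-time entries registered here are never freed; kcore_update_ram() later replaces only the KCORE_RAM and KCORE_VMEMMAP ones.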
*/ +void __init kclist_add(struct kcore_list *new, void *addr, size_t size, + int type) { new->addr = (unsigned long)addr; new->size = size; new->type = type; - write_lock(&kclist_lock); list_add_tail(&new->list, &kclist_head); - write_unlock(&kclist_lock); } -static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +static void update_kcore_size(void) { size_t try, size; struct kcore_list *m; - *nphdr = 1; /* PT_NOTE */ + kcore_nphdr = 1; /* PT_NOTE */ size = 0; list_for_each_entry(m, &kclist_head, list) { try = kc_vaddr_to_offset((size_t)m->addr + m->size); if (try > size) size = try; - *nphdr = *nphdr + 1; - } - *elf_buflen = sizeof(struct elfhdr) + - (*nphdr + 2)*sizeof(struct elf_phdr) + - 3 * ((sizeof(struct elf_note)) + - roundup(sizeof(CORE_STR), 4)) + - roundup(sizeof(struct elf_prstatus), 4) + - roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(arch_task_struct_size, 4); - *elf_buflen = PAGE_ALIGN(*elf_buflen); - return size + *elf_buflen; -} - -static void free_kclist_ents(struct list_head *head) -{ - struct kcore_list *tmp, *pos; - - list_for_each_entry_safe(pos, tmp, head, list) { - list_del(&pos->list); - kfree(pos); + kcore_nphdr++; } -} -/* - * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. - */ -static void __kcore_update_ram(struct list_head *list) -{ - int nphdr; - size_t size; - struct kcore_list *tmp, *pos; - LIST_HEAD(garbage); - write_lock(&kclist_lock); - if (kcore_need_update) { - list_for_each_entry_safe(pos, tmp, &kclist_head, list) { - if (pos->type == KCORE_RAM - || pos->type == KCORE_VMEMMAP) - list_move(&pos->list, &garbage); - } - list_splice_tail(list, &kclist_head); - } else - list_splice(list, &garbage); - kcore_need_update = 0; - proc_root_kcore->size = get_kcore_size(&nphdr, &size); - write_unlock(&kclist_lock); - - free_kclist_ents(&garbage); + kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); + kcore_notes_len = (4 * sizeof(struct elf_note) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len + + kcore_notes_len); + proc_root_kcore->size = kcore_data_offset + size; } - #ifdef CONFIG_HIGHMEM /* * If no highmem, we can assume [0...max_low_pfn) continuous range of memory * because memory hole is not as big as !HIGHMEM case. * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) 
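 * Hence the code below exports just the direct-mapped range [0, max_low_pfn) as a single KCORE_RAM entry.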
*/ -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *head) { - LIST_HEAD(head); struct kcore_list *ent; - int ret = 0; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) @@ -153,9 +149,8 @@ static int kcore_update_ram(void) ent->addr = (unsigned long)__va(0); ent->size = max_low_pfn << PAGE_SHIFT; ent->type = KCORE_RAM; - list_add(&ent->list, &head); - __kcore_update_ram(&head); - return ret; + list_add(&ent->list, head); + return 0; } #else /* !CONFIG_HIGHMEM */ @@ -208,25 +203,32 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) { struct list_head *head = (struct list_head *)arg; struct kcore_list *ent; + struct page *p; + + if (!pfn_valid(pfn)) + return 1; + + p = pfn_to_page(pfn); ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; - ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - /* Sanity check: Can happen in 32bit arch...maybe */ - if (ent->addr < (unsigned long) __va(0)) + if (!virt_addr_valid((void *)ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ if (ULONG_MAX - ent->addr < ent->size) ent->size = ULONG_MAX - ent->addr; - /* cut when vmalloc() area is higher than direct-map area */ - if (VMALLOC_START > (unsigned long)__va(0)) { - if (ent->addr > VMALLOC_START) - goto free_out; + /* + * We've already checked virt_addr_valid so we know this address + * is a valid pointer, therefore we can check against it to determine + * if we need to trim + */ + if (VMALLOC_START > ent->addr) { if (VMALLOC_START - ent->addr < ent->size) ent->size = VMALLOC_START - ent->addr; } @@ -245,13 +247,12 @@ free_out: return 1; } -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *list) { int nid, ret; unsigned long end_pfn; - LIST_HEAD(head); - /* Not inialized....update now */ + /* Not initialized....update now */ /* find out "max pfn" */ end_pfn = 0; for_each_node_state(nid, N_MEMORY) { @@ -261,297 +262,386 @@ static int kcore_update_ram(void) end_pfn = node_end; } /* scan 0 to max_pfn */ - ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); - if (ret) { - free_kclist_ents(&head); + ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private); + if (ret) return -ENOMEM; - } - __kcore_update_ram(&head); - return ret; + return 0; } #endif /* CONFIG_HIGHMEM */ -/*****************************************************************************/ -/* - * determine size of ELF note - */ -static int notesize(struct memelfnote *en) -{ - int sz; - - sz = sizeof(struct elf_note); - sz += roundup((strlen(en->name) + 1), 4); - sz += roundup(en->datasz, 4); - - return sz; -} /* end notesize() */ - -/*****************************************************************************/ -/* - * store a note in the header buffer - */ -static char *storenote(struct memelfnote *men, char *bufp) +static int kcore_update_ram(void) { - struct elf_note en; + LIST_HEAD(list); + LIST_HEAD(garbage); + struct kcore_list *tmp, *pos; + int ret = 0; -#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) + percpu_down_write(&kclist_lock); + if (!xchg(&kcore_need_update, 0)) + goto out; - en.n_namesz = strlen(men->name) + 1; - en.n_descsz = men->datasz; - en.n_type = men->type; + ret = kcore_ram_list(&list); + if (ret) { + /* Couldn't get the RAM list, try again next time. 
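+ * The half-built list is spliced onto the garbage list and freed on the way out.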
*/ + WRITE_ONCE(kcore_need_update, 1); + list_splice_tail(&list, &garbage); + goto out; + } - DUMP_WRITE(&en, sizeof(en)); - DUMP_WRITE(men->name, en.n_namesz); + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(&list, &kclist_head); - /* XXX - cast from long long to long to avoid need for libgcc.a */ - bufp = (char*) roundup((unsigned long)bufp,4); - DUMP_WRITE(men->data, men->datasz); - bufp = (char*) roundup((unsigned long)bufp,4); + update_kcore_size(); -#undef DUMP_WRITE +out: + percpu_up_write(&kclist_lock); + list_for_each_entry_safe(pos, tmp, &garbage, list) { + list_del(&pos->list); + kfree(pos); + } + return ret; +} - return bufp; -} /* end storenote() */ +static void append_kcore_note(char *notes, size_t *i, const char *name, + unsigned int type, const void *desc, + size_t descsz) +{ + struct elf_note *note = (struct elf_note *)&notes[*i]; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = descsz; + note->n_type = type; + *i += sizeof(*note); + memcpy(&notes[*i], name, note->n_namesz); + *i = ALIGN(*i + note->n_namesz, 4); + memcpy(&notes[*i], desc, descsz); + *i = ALIGN(*i + descsz, 4); +} -/* - * store an ELF coredump header in the supplied buffer - * nphdr is the number of elf_phdr to insert - */ -static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ - struct elf_phdr *nhdr, *phdr; - struct elfhdr *elf; - struct memelfnote notes[3]; - off_t offset = 0; + struct file *file = iocb->ki_filp; + char *buf = file->private_data; + loff_t *fpos = &iocb->ki_pos; + size_t phdrs_offset, notes_offset; + size_t page_offline_frozen = 1; struct kcore_list *m; + size_t tsz; + unsigned long start; + size_t buflen = iov_iter_count(iter); + size_t orig_buflen = buflen; + int ret = 0; - /* setup ELF header */ - elf = (struct elfhdr *) bufp; - bufp += sizeof(struct elfhdr); - offset += sizeof(struct elfhdr); - memcpy(elf->e_ident, ELFMAG, SELFMAG); - elf->e_ident[EI_CLASS] = ELF_CLASS; - elf->e_ident[EI_DATA] = ELF_DATA; - elf->e_ident[EI_VERSION]= EV_CURRENT; - elf->e_ident[EI_OSABI] = ELF_OSABI; - memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); - elf->e_type = ET_CORE; - elf->e_machine = ELF_ARCH; - elf->e_version = EV_CURRENT; - elf->e_entry = 0; - elf->e_phoff = sizeof(struct elfhdr); - elf->e_shoff = 0; - elf->e_flags = ELF_CORE_EFLAGS; - elf->e_ehsize = sizeof(struct elfhdr); - elf->e_phentsize= sizeof(struct elf_phdr); - elf->e_phnum = nphdr; - elf->e_shentsize= 0; - elf->e_shnum = 0; - elf->e_shstrndx = 0; - - /* setup ELF PT_NOTE program header */ - nhdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - nhdr->p_type = PT_NOTE; - nhdr->p_offset = 0; - nhdr->p_vaddr = 0; - nhdr->p_paddr = 0; - nhdr->p_filesz = 0; - nhdr->p_memsz = 0; - nhdr->p_flags = 0; - nhdr->p_align = 0; - - /* setup ELF PT_LOAD program header for every area */ - list_for_each_entry(m, &kclist_head, list) { - phdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; - phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM || m->type == KCORE_TEXT) - phdr->p_paddr = __pa(m->addr); - else - 
phdr->p_paddr = (elf_addr_t)-1; - phdr->p_filesz = phdr->p_memsz = m->size; - phdr->p_align = PAGE_SIZE; - } - + percpu_down_read(&kclist_lock); /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. + * Don't race against drivers that set PageOffline() and expect no + * further page access. */ - nhdr->p_offset = offset; - - /* set up the process status */ - notes[0].name = CORE_STR; - notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(struct elf_prstatus); - notes[0].data = &prstatus; - - memset(&prstatus, 0, sizeof(struct elf_prstatus)); - - nhdr->p_filesz = notesize(&notes[0]); - bufp = storenote(&notes[0], bufp); - - /* set up the process info */ - notes[1].name = CORE_STR; - notes[1].type = NT_PRPSINFO; - notes[1].datasz = sizeof(struct elf_prpsinfo); - notes[1].data = &prpsinfo; - - memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo)); - prpsinfo.pr_state = 0; - prpsinfo.pr_sname = 'R'; - prpsinfo.pr_zomb = 0; - - strcpy(prpsinfo.pr_fname, "vmlinux"); - strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); + page_offline_freeze(); + + phdrs_offset = sizeof(struct elfhdr); + notes_offset = phdrs_offset + kcore_phdrs_len; + + /* ELF file header. */ + if (buflen && *fpos < sizeof(struct elfhdr)) { + struct elfhdr ehdr = { + .e_ident = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELF_CLASS, + [EI_DATA] = ELF_DATA, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELF_OSABI, + }, + .e_type = ET_CORE, + .e_machine = ELF_ARCH, + .e_version = EV_CURRENT, + .e_phoff = sizeof(struct elfhdr), + .e_flags = ELF_CORE_EFLAGS, + .e_ehsize = sizeof(struct elfhdr), + .e_phentsize = sizeof(struct elf_phdr), + .e_phnum = kcore_nphdr, + }; + + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); + if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } - nhdr->p_filesz += notesize(&notes[1]); - bufp = storenote(&notes[1], bufp); + buflen -= tsz; + *fpos += tsz; + } - /* set up the task structure */ - notes[2].name = CORE_STR; - notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = arch_task_struct_size; - notes[2].data = current; + /* ELF program headers. 
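+ * These are rebuilt on every read, under kclist_lock, so the offsets always match the current kclist.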
*/ + if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) { + struct elf_phdr *phdrs, *phdr; - nhdr->p_filesz += notesize(&notes[2]); - bufp = storenote(&notes[2], bufp); + phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL); + if (!phdrs) { + ret = -ENOMEM; + goto out; + } -} /* end elf_kcore_store_hdr() */ + phdrs[0].p_type = PT_NOTE; + phdrs[0].p_offset = notes_offset; + phdrs[0].p_filesz = kcore_notes_len; -/*****************************************************************************/ -/* - * read from the ELF header and then kernel memory - */ -static ssize_t -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) -{ - char *buf = file->private_data; - ssize_t acc = 0; - size_t size, tsz; - size_t elf_buflen; - int nphdr; - unsigned long start; + phdr = &phdrs[1]; + list_for_each_entry(m, &kclist_head, list) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + + kcore_data_offset; + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) + phdr->p_paddr = __pa(m->addr); + else if (m->type == KCORE_TEXT) + phdr->p_paddr = __pa_symbol(m->addr); + else + phdr->p_paddr = (elf_addr_t)-1; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + phdr++; + } - read_lock(&kclist_lock); - size = get_kcore_size(&nphdr, &elf_buflen); + tsz = min_t(size_t, buflen, + phdrs_offset + kcore_phdrs_len - *fpos); + if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, + iter) != tsz) { + kfree(phdrs); + ret = -EFAULT; + goto out; + } + kfree(phdrs); - if (buflen == 0 || *fpos >= size) { - read_unlock(&kclist_lock); - return 0; + buflen -= tsz; + *fpos += tsz; } - /* trim buflen to not go beyond EOF */ - if (buflen > size - *fpos) - buflen = size - *fpos; - - /* construct an ELF core header if we'll need some of it */ - if (*fpos < elf_buflen) { - char * elf_buf; - - tsz = elf_buflen - *fpos; - if (buflen < tsz) - tsz = buflen; - elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); - if (!elf_buf) { - read_unlock(&kclist_lock); - return -ENOMEM; + /* ELF note segment. */ + if (buflen && *fpos < notes_offset + kcore_notes_len) { + struct elf_prstatus prstatus = {}; + struct elf_prpsinfo prpsinfo = { + .pr_sname = 'R', + .pr_fname = "vmlinux", + }; + char *notes; + size_t i = 0; + + strscpy(prpsinfo.pr_psargs, saved_command_line, + sizeof(prpsinfo.pr_psargs)); + + notes = kzalloc(kcore_notes_len, GFP_KERNEL); + if (!notes) { + ret = -ENOMEM; + goto out; } - elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); - read_unlock(&kclist_lock); - if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { - kfree(elf_buf); - return -EFAULT; + + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, + sizeof(prpsinfo)); + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, + arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. 
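+ * Hence the min() clamp on the copied size in the call below.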
+ */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, kcore_notes_len - i)); + + tsz = min_t(size_t, buflen, + notes_offset + kcore_notes_len - *fpos); + if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { + kfree(notes); + ret = -EFAULT; + goto out; } - kfree(elf_buf); + kfree(notes); + buflen -= tsz; *fpos += tsz; - buffer += tsz; - acc += tsz; - - /* leave now if filled buffer already */ - if (buflen == 0) - return acc; - } else - read_unlock(&kclist_lock); + } /* * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. */ - start = kc_offset_to_vaddr(*fpos - elf_buflen); + start = kc_offset_to_vaddr(*fpos - kcore_data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; - + + m = NULL; while (buflen) { - struct kcore_list *m; + struct page *page; + unsigned long pfn; + phys_addr_t phys; + void *__start; + + /* + * If this is the first iteration or the address is not within + * the previous entry, search for a matching entry. + */ + if (!m || start < m->addr || start >= m->addr + m->size) { + struct kcore_list *pos; + + m = NULL; + list_for_each_entry(pos, &kclist_head, list) { + if (start >= pos->addr && + start < pos->addr + pos->size) { + m = pos; + break; + } + } + } - read_lock(&kclist_lock); - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && start < (m->addr+m->size)) - break; + if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) { + page_offline_thaw(); + cond_resched(); + page_offline_freeze(); } - read_unlock(&kclist_lock); - - if (&m->list == &kclist_head) { - if (clear_user(buffer, tsz)) - return -EFAULT; - } else if (m->type == KCORE_VMALLOC) { - vread(buf, (char *)start, tsz); - /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) - return -EFAULT; - } else { - if (kern_addr_valid(start)) { - unsigned long n; - - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - memcpy(buf, (char *) start, tsz); - n = copy_to_user(buffer, buf, tsz); - /* - * We cannot distinguish between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - n)) - return -EFAULT; + + if (!m) { + if (iov_iter_zero(tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + goto skip; + } + + switch (m->type) { + case KCORE_VMALLOC: + { + const char *src = (char *)start; + size_t read = 0, left = tsz; + + /* + * vmalloc uses spinlocks, so we optimistically try to + * read memory. If this fails, fault pages in and try + * again until we are done. + */ + while (true) { + read += vread_iter(iter, src, left); + if (read == tsz) + break; + + src += read; + left -= read; + + if (fault_in_iov_iter_writeable(iter, left)) { + ret = -EFAULT; + goto out; + } + } + break; + } + case KCORE_USER: + /* User page is handled prior to normal kernel page: */ + if (copy_to_iter((char *)start, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + break; + case KCORE_RAM: + phys = __pa(start); + pfn = phys >> PAGE_SHIFT; + page = pfn_to_online_page(pfn); + + /* + * Don't read offline sections, logically offline pages + * (e.g., inflated in a balloon), hwpoisoned pages, + * and explicitly excluded physical ranges. 
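+ * Such ranges simply read back as zeroes instead of failing the read.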
+ */ + if (!page || PageOffline(page) || + is_page_hwpoison(page) || !pfn_is_ram(pfn) || + pfn_is_unaccepted_memory(pfn)) { + if (iov_iter_zero(tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + break; + } + fallthrough; + case KCORE_VMEMMAP: + case KCORE_TEXT: + if (m->type == KCORE_RAM) { + __start = kc_xlate_dev_mem_ptr(phys); + if (!__start) { + ret = -ENOMEM; + if (iov_iter_zero(tsz, iter) != tsz) + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) - return -EFAULT; + __start = (void *)start; + } + + /* + * Sadly we must use a bounce buffer here to be able to + * make use of copy_from_kernel_nofault(), as these + * memory regions might not always be mapped on all + * architectures. + */ + ret = copy_from_kernel_nofault(buf, __start, tsz); + if (m->type == KCORE_RAM) + kc_unxlate_dev_mem_ptr(phys, __start); + if (ret) { + if (iov_iter_zero(tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + ret = 0; + /* + * We know the bounce buffer is safe to copy from, so + * use _copy_to_iter() directly. + */ + } else if (_copy_to_iter(buf, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; + } + break; + default: + pr_warn_once("Unhandled KCORE type: %d\n", m->type); + if (iov_iter_zero(tsz, iter) != tsz) { + ret = -EFAULT; + goto out; } } +skip: buflen -= tsz; *fpos += tsz; - buffer += tsz; - acc += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } - return acc; +out: + page_offline_thaw(); + percpu_up_read(&kclist_lock); + if (ret) + return ret; + return orig_buflen - buflen; } - static int open_kcore(struct inode *inode, struct file *filp) { + int ret = security_locked_down(LOCKDOWN_KCORE); + if (!capable(CAP_SYS_RAWIO)) return -EPERM; + if (ret) + return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!filp->private_data) return -ENOMEM; @@ -572,11 +662,12 @@ static int release_kcore(struct inode *inode, struct file *file) return 0; } -static const struct file_operations proc_kcore_operations = { - .read = read_kcore, - .open = open_kcore, - .release = release_kcore, - .llseek = default_llseek, +static const struct proc_ops kcore_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_read_iter = read_kcore_iter, + .proc_open = open_kcore, + .proc_release = release_kcore, + .proc_lseek = default_llseek, }; /* just remember that we have to update kcore */ @@ -586,17 +677,12 @@ static int __meminit kcore_callback(struct notifier_block *self, switch (action) { case MEM_ONLINE: case MEM_OFFLINE: - write_lock(&kclist_lock); kcore_need_update = 1; - write_unlock(&kclist_lock); + break; } return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -620,7 +706,7 @@ static void __init proc_kcore_text_init(void) /* * MODULES_VADDR has no intersection with VMALLOC_ADDR. */ -struct kcore_list kcore_modules; +static struct kcore_list kcore_modules; static void __init add_modules_range(void) { if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { @@ -636,8 +722,7 @@ static void __init add_modules_range(void) static int __init proc_kcore_init(void) { - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, - &proc_kcore_operations); + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops); if (!proc_root_kcore) { pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. 
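A missing /proc/kcore is not fatal; the pr_err() above is the only diagnostic.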
*/ @@ -650,7 +735,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index f9387bb7631b..2fc92a13f9f8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/kmsg.c * @@ -14,11 +15,8 @@ #include <linux/fs.h> #include <linux/syslog.h> -#include <linux/uaccess.h> #include <asm/io.h> -extern wait_queue_head_t log_wait; - static int kmsg_open(struct inode * inode, struct file * file) { return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); @@ -39,26 +37,27 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } -static unsigned int kmsg_poll(struct file *file, poll_table *wait) +static __poll_t kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } -static const struct file_operations proc_kmsg_operations = { - .read = kmsg_read, - .poll = kmsg_poll, - .open = kmsg_open, - .release = kmsg_release, - .llseek = generic_file_llseek, +static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_read = kmsg_read, + .proc_poll = kmsg_poll, + .proc_open = kmsg_open, + .proc_release = kmsg_release, + .proc_lseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) { - proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops); return 0; } fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 983fce5c2418..817981e57223 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/pid_namespace.h> @@ -8,9 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -18,30 +17,21 @@ static int loadavg_proc_show(struct seq_file *m, void *v) get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } -static int loadavg_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, loadavg_proc_show, NULL); -} - -static const struct file_operations loadavg_proc_fops = { - .open = loadavg_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_loadavg_init(void) { - proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 
8a428498d6b2..a458f1e112fd 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -5,8 +6,9 @@ #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> -#include <linux/quicklist.h> +#include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> @@ -15,8 +17,8 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#include <linux/zswap.h> #include <asm/page.h> -#include <asm/pgtable.h> #include "internal.h" void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) @@ -25,20 +27,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } @@ -49,11 +38,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); si_swapinfo(&i); - committed = percpu_counter_read_positive(&vm_committed_as); + committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -64,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -80,7 +72,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); - show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); @@ -96,6 +88,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); +#ifdef CONFIG_ZSWAP + show_val_kb(m, "Zswap: ", zswap_total_pages()); + seq_printf(m, "Zswapped: %8lu kB\n", + (unsigned long)atomic_long_read(&zswap_stored_pages) << + (PAGE_SHIFT - 10)); +#endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", @@ -105,34 +103,34 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); - show_val_kb(m, "Slab: ", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); - - show_val_kb(m, "SReclaimable: ", - global_page_state(NR_SLAB_RECLAIMABLE)); - show_val_kb(m, "SUnreclaim: ", - global_page_state(NR_SLAB_UNRECLAIMABLE)); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", 
sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", - global_page_state(NR_KERNEL_STACK_KB)); - show_val_kb(m, "PageTables: ", - global_page_state(NR_PAGETABLE)); -#ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); + global_node_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_node_page_state(NR_KERNEL_SCS_KB)); #endif + show_val_kb(m, "PageTables: ", + global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); - show_val_kb(m, "Bounce: ", - global_page_state(NR_BOUNCE)); - show_val_kb(m, "WritebackTmp: ", - global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "NFS_Unstable: ", 0); + show_val_kb(m, "Bounce: ", 0); + show_val_kb(m, "WritebackTmp: ", 0); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); + + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", @@ -141,19 +139,30 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", - global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", - global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", - global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_PMDMAPPED)); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS)); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + show_val_kb(m, "Unaccepted: ", + global_zone_page_state(NR_UNACCEPTED)); +#endif + show_val_kb(m, "Balloon: ", + global_node_page_state(NR_BALLOON_PAGES)); + hugetlb_report_meminfo(m); arch_report_meminfo(m); @@ -161,21 +170,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) return 0; } -static int meminfo_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, meminfo_proc_show, NULL); -} - -static const struct file_operations meminfo_proc_fops = { - .open = meminfo_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_meminfo_init(void) { - proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3803b24ca220..ea2b597fd92c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> 
@@ -11,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -32,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, @@ -41,22 +46,26 @@ static const char *proc_ns_get_link(struct dentry *dentry, const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; - void *error = ERR_PTR(-EACCES); + int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) - return error; + return ERR_PTR(-EACCES); - if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { - error = ns_get_path(&ns_path, task, ns_ops); - if (!error) - nd_jump_link(&ns_path); - } + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out; + + error = ns_get_path(&ns_path, task, ns_ops); + if (error) + goto out; + + error = nd_jump_link(&ns_path); +out: put_task_struct(task); - return error; + return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) @@ -74,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; @@ -86,34 +95,29 @@ static const struct inode_operations proc_ns_link_inode_operations = { .setattr = proc_setattr, }; -static int proc_ns_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_ns_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; + pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -146,12 +150,10 @@ const struct file_operations proc_ns_dir_operations = { static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - int error; struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -166,11 +168,11 @@ static struct dentry 
*proc_ns_dir_lookup(struct inode *dir, if (entry == last) goto out; - error = proc_ns_instantiate(dir, dentry, task, *entry); + res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } const struct inode_operations proc_ns_dir_inode_operations = { diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 75634379f82e..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* nommu.c: mmu-less memory info files * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/init.h> @@ -25,8 +21,6 @@ #include <linux/seq_file.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> -#include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/div64.h> #include "internal.h" @@ -64,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); @@ -113,21 +107,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = { .show = nommu_region_list_show }; -static int proc_nommu_region_list_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &proc_nommu_region_list_seqop); -} - -static const struct file_operations proc_nommu_region_list_operations = { - .open = proc_nommu_region_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop); return 0; } diff --git a/fs/proc/page.c b/fs/proc/page.c index 2726536489b1..f9b2c2c906cd 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -1,4 +1,5 @@ -#include <linux/bootmem.h> +// SPDX-License-Identifier: GPL-2.0 +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/init.h> @@ -9,6 +10,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memremap.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> @@ -18,39 +20,89 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) -#define KPMBITS (KPMSIZE * BITS_PER_BYTE) -/* /proc/kpagecount - an array exposing page counts - * - * Each entry is a u64 representing the corresponding - * physical page count. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; + +static inline unsigned long get_max_dump_pfn(void) +{ +#ifdef CONFIG_SPARSEMEM + /* + * The memmap of early sections is completely populated and marked + * online even if max_pfn does not fall on a section boundary - + * pfn_to_online_page() will succeed on all pages. Allow inspecting + * these memmaps. 
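+ * (With 4-KiB pages on x86-64, for instance, a section spans 32768 pages, so a max_pfn of 33000 is rounded up to 65536.)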
+ */ + return round_up(max_pfn, PAGES_PER_SECTION); +#else + return max_pfn; +#endif +} + +static u64 get_kpage_count(const struct page *page) +{ + struct page_snapshot ps; + u64 ret; + + snapshot_page(&ps, page); + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + ret = folio_precise_page_mapcount(&ps.folio_snapshot, + &ps.page_snapshot); + else + ret = folio_average_page_mapcount(&ps.folio_snapshot); + + return ret; +} + +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 pcount; + u64 info; pfn = src / KPMSIZE; - count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; - if (!ppage || PageSlab(ppage)) - pcount = 0; - else - pcount = page_mapcount(ppage); - - if (put_user(pcount, out)) { + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + page = pfn_to_online_page(pfn); + + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + info = get_kpage_count(page); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -68,26 +120,37 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecount_operations = { - .llseek = mem_lseek, - .read = kpagecount_read, -}; - -/* /proc/kpageflags - an array exposing page flags +/* /proc/kpagecount - an array exposing page mapcounts * * Each entry is a u64 representing the corresponding - * physical page flags. + * physical page mapcount. */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + +static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpagecount_read, +}; + static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -u64 stable_page_flags(struct page *page) +u64 stable_page_flags(const struct page *page) { - u64 k; - u64 u; + const struct folio *folio; + struct page_snapshot ps; + unsigned long k; + unsigned long mapping; + bool is_anon; + u64 u = 0; /* * pseudo flag: KPF_NOPAGE @@ -96,74 +159,63 @@ u64 stable_page_flags(struct page *page) if (!page) return 1 << KPF_NOPAGE; - k = page->flags; - u = 0; + snapshot_page(&ps, page); + folio = &ps.folio_snapshot; + + k = folio->flags.f; + mapping = (unsigned long)folio->mapping; + is_anon = mapping & FOLIO_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages - * - * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. 
*/ - if (!PageSlab(page) && page_mapped(page)) + if (folio_mapped(folio)) u |= 1 << KPF_MMAP; - if (PageAnon(page)) + if (is_anon) { u |= 1 << KPF_ANON; - if (PageKsm(page)) - u |= 1 << KPF_KSM; + if (mapping & FOLIO_MAPPING_KSM) + u |= 1 << KPF_KSM; + } /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; - if (PageTail(page)) + if (ps.idx == 0) + u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); + else u |= 1 << KPF_COMPOUND_TAIL; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; - /* - * PageTransCompound can be true for non-huge compound pages (slab - * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon - * to make sure a given page is a thp, not a non-huge compound page. - */ - else if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; - else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; - } - } else if (is_zero_pfn(page_to_pfn(page))) + else if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + /* Note: we indicate any THPs here, not just PMD-sized ones */ + u |= 1 << KPF_THP; + } else if (is_huge_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } else if (is_zero_pfn(ps.pfn)) { + u |= 1 << KPF_ZERO_PAGE; + } - - /* - * Caveats on high order pages: page->_refcount will only be set - * -1 on the head page; SLUB/SLQB do the same for PG_slab; - * SLOB won't set PG_slab at all on compound pages. - */ - if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; - else if (page_count(page) == 0 && is_free_buddy_page(page)) + if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (folio_test_offline(folio)) + u |= 1 << KPF_OFFLINE; + if (folio_test_pgtable(folio)) + u |= 1 << KPF_PGTABLE; + if (folio_test_slab(folio)) + u |= 1 << KPF_SLAB; - if (page_is_idle(page)) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) + u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); +#else + if (ps.flags & PAGE_SNAPSHOT_PG_IDLE) u |= 1 << KPF_IDLE; +#endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; - - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); @@ -173,7 +225,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - if (PageSwapCache(page)) +#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache)) + if ((k & SWAPCACHE) == SWAPCACHE) u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); @@ -181,123 +234,65 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); #ifdef CONFIG_MEMORY_FAILURE - u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); -#endif - -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); + if (u & (1 << KPF_HUGE)) + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + else + u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, 
PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); +#endif return u; -}; +} +EXPORT_SYMBOL_GPL(stable_page_flags); +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - - while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; - - if (put_user(stable_page_flags(ppage), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } -static const struct file_operations proc_kpageflags_operations = { - .llseek = mem_lseek, - .read = kpageflags_read, +static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpageflags_read, }; #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - - while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - -static const struct file_operations proc_kpagecgroup_operations = { - .llseek = mem_lseek, - .read = kpagecgroup_read, +static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpagecgroup_read, }; #endif /* CONFIG_MEMCG */ static int __init proc_page_init(void) { - proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); - proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops); + proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops); #ifdef CONFIG_MEMCG - proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); + proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops); #endif return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d72fc40241d9..52f0b75cbce2 100644 --- a/fs/proc/proc_net.c +++ 
b/fs/proc/proc_net.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/proc/net.c * @@ -7,9 +8,6 @@ * * proc net directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -38,73 +36,234 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -int seq_open_net(struct inode *ino, struct file *f, - const struct seq_operations *ops, int size) +static int seq_open_net(struct inode *inode, struct file *file) { - struct net *net; + unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; + struct net *net; - BUG_ON(size < sizeof(*p)); + WARN_ON_ONCE(state_size < sizeof(*p)); - net = get_proc_net(ino); - if (net == NULL) + if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) + return -EACCES; + + net = get_proc_net(inode); + if (!net) return -ENXIO; - p = __seq_open_private(f, ops, size); - if (p == NULL) { + p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); + if (!p) { put_net(net); return -ENOMEM; } #ifdef CONFIG_NET_NS p->net = net; + netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } -EXPORT_SYMBOL_GPL(seq_open_net); -int single_open_net(struct inode *inode, struct file *file, - int (*show)(struct seq_file *, void *)) +static void seq_file_net_put_net(struct seq_file *seq) { - int err; - struct net *net; +#ifdef CONFIG_NET_NS + struct seq_net_private *priv = seq->private; - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; + put_net_track(priv->net, &priv->ns_tracker); +#else + put_net(&init_net); +#endif +} - err = single_open(file, show, net); - if (err < 0) - goto err_open; +static int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + seq_file_net_put_net(seq); + seq_release_private(ino, f); return 0; +} -err_open: - put_net(net); -err_net: - return err; +static const struct proc_ops proc_net_seq_ops = { + .proc_open = seq_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release_net, +}; + +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, + GFP_KERNEL); +#endif + return 0; } -EXPORT_SYMBOL_GPL(single_open_net); -int seq_release_net(struct inode *ino, struct file *f) +void bpf_iter_fini_seq_net(void *priv_data) { - struct seq_file *seq; +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; - seq = f->private_data; + put_net_track(p->net, &p->ns_tracker); +#endif +} - put_net(seq_file_net(seq)); - seq_release_private(ino, f); - return 0; +struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data); + +/** + * proc_create_net_data_write - Create a writable net_ns-specific proc file + * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @ops: The seq_file ops with which to read the file. 
+ * @write: The write method with which to 'modify' the file. + * @state_size: The size of the per-file private state to allocate. + * @data: Data for retrieval by pde_data(). + * + * Create a network namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * series of elements and also provides for the file accepting writes that have + * some arbitrary effect. + * + * The functions in the @ops table are used to iterate over items to be + * presented and extract the readable content using the seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * pde_data() on the file inode. The network namespace must be accessed by + * calling seq_file_net() on the seq_file struct. + */ +struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *ops, + proc_write_t write, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_write); + +static int single_open_net(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + struct net *net; + int err; + + net = get_proc_net(inode); + if (!net) + return -ENXIO; + + err = single_open(file, de->single_show, net); + if (err) + put_net(net); + return err; } -EXPORT_SYMBOL_GPL(seq_release_net); -int single_release_net(struct inode *ino, struct file *f) +static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } -EXPORT_SYMBOL_GPL(single_release_net); + +static const struct proc_ops proc_net_single_ops = { + .proc_open = single_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = single_release_net, +}; + +struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_single_ops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single); + +/** + * proc_create_net_single_write - Create a writable net_ns-specific proc file + * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @show: The seqfile show method with which to read the file. + * @write: The write method with which to 'modify' the file. + * @data: Data for retrieval by pde_data(). + * + * Create a network-namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * single element rather than a series and also provides for the file accepting + * writes that have some arbitrary effect. 
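As a hedged usage sketch (hypothetical "foo" names, not taken from this patch), this is how a subsystem would publish one read-only file per network namespace with the proc_create_net_single() helper defined above: the show callback recovers its namespace via seq_file_single_net(), and a pernet init/exit pair creates and removes the file as namespaces come and go. Netns refcounting on open and release is handled for us by proc_net_single_ops.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file_net.h>
#include <net/net_namespace.h>

static int foo_show(struct seq_file *seq, void *v)
{
        struct net *net = seq_file_single_net(seq);

        /* any per-namespace state would be derived from @net here */
        seq_printf(seq, "netns inode: %u\n", net->ns.inum);
        return 0;
}

static int __net_init foo_net_init(struct net *net)
{
        if (!proc_create_net_single("foo", 0444, net->proc_net,
                                    foo_show, NULL))
                return -ENOMEM;
        return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
        remove_proc_entry("foo", net->proc_net);
}

static struct pernet_operations foo_net_ops = {
        .init = foo_net_init,
        .exit = foo_net_exit,
};

static int __init foo_init(void)
{
        return register_pernet_subsys(&foo_net_ops);
}

static void __exit foo_exit(void)
{
        unregister_pernet_subsys(&foo_net_ops);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");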
+ * + * The @show function is called to extract the readable content via the + * seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * pde_data() on the file inode. The network namespace must be accessed by + * calling seq_file_single_net() on the seq_file struct. + */ +struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), + proc_write_t write, + void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_single_ops; + p->single_show = show; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { @@ -135,13 +294,14 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; } -static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, +static int proc_tgid_net_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -149,7 +309,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, net = get_proc_task_net(inode); - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; @@ -162,6 +322,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, + .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) @@ -172,7 +333,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, file, ctx); + ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; @@ -191,8 +352,14 @@ static __net_init int proc_net_ns_init(struct net *net) kgid_t gid; int err; + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. 
+ */ err = -ENOMEM; - netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; @@ -201,6 +368,7 @@ static __net_init int proc_net_ns_init(struct net *net) netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -213,6 +381,9 @@ static __net_init int proc_net_ns_init(struct net *net) proc_set_user(netd, uid, gid); + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) @@ -223,7 +394,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +402,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8f479229b349..49ab74e0bfde 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ @@ -11,36 +12,52 @@ #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> +#include <linux/uio.h> #include <linux/module.h> +#include <linux/bpf-cgroup.h> +#include <linux/mount.h> +#include <linux/kmemleak.h> +#include <linux/lockdep.h> #include "internal.h" +#define list_for_each_table_entry(entry, header) \ + entry = header->ctl_table; \ + for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) + static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* Support for permanently empty directories */ - -struct ctl_table sysctl_mount_point[] = { +/* + * Support for permanently empty directories. + * Must be non-empty to avoid sharing an address with other tables. + */ +static const struct ctl_table sysctl_mount_point[] = { { } }; -static bool is_empty_dir(struct ctl_table_header *head) -{ - return head->ctl_table[0].child == sysctl_mount_point; -} - -static void set_empty_dir(struct ctl_dir *dir) +/** + * register_sysctl_mount_point() - registers a sysctl mount point + * @path: path for the mount point + * + * Used to create a permanently empty directory to serve as mount point. + * There are some subtle but important permission checks this allows in the + * case of unprivileged mounts. 
+ */ +struct ctl_table_header *register_sysctl_mount_point(const char *path) { - dir->header.ctl_table[0].child = sysctl_mount_point; + return register_sysctl_sz(path, sysctl_mount_point, 0); } +EXPORT_SYMBOL(register_sysctl_mount_point); -static void clear_empty_dir(struct ctl_dir *dir) - -{ - dir->header.ctl_table[0].child = NULL; -} +#define sysctl_is_perm_empty_ctl_header(hptr) \ + (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_set_perm_empty_ctl_header(hptr) \ + (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_clear_perm_empty_ctl_header(hptr) \ + (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { @@ -51,12 +68,11 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } -static struct ctl_table root_table[] = { +static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, - { } }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { @@ -73,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry); + const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -86,27 +102,23 @@ static void sysctl_print_dir(struct ctl_dir *dir) static int namecmp(const char *name1, int len1, const char *name2, int len2) { - int minlen; int cmp; - minlen = len1; - if (minlen > len2) - minlen = len2; - - cmp = memcmp(name1, name2, minlen); + cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } -/* Called under sysctl_lock */ -static struct ctl_table *find_entry(struct ctl_table_header **phead, +static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; + lockdep_assert_held(&sysctl_lock); + while (node) { struct ctl_node *ctl_node; @@ -131,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead, return NULL; } -static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; @@ -141,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) while (*p) { struct ctl_table_header *parent_head; - struct ctl_table *parent_entry; + const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; @@ -160,7 +172,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - pr_cont("/%s\n", entry->procname); + pr_cont("%s\n", entry->procname); return -EEXIST; } } @@ -170,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) return 0; } -static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; @@ -179,9 +191,10 @@ static void 
erase_entry(struct ctl_table_header *head, struct ctl_table *entry) static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, - struct ctl_node *node, struct ctl_table *table) + struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; + head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; @@ -193,41 +206,49 @@ static void init_header(struct ctl_table_header *head, head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { - struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) + const struct ctl_table *entry; + + list_for_each_table_entry(entry, head) { node->header = head; + node++; + } } + if (table == sysctl_mount_point) + sysctl_set_perm_empty_ctl_header(head); } static void erase_header(struct ctl_table_header *head) { - struct ctl_table *entry; - for (entry = head->ctl_table; entry->procname; entry++) + const struct ctl_table *entry; + + list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { - struct ctl_table *entry; + const struct ctl_table *entry; + struct ctl_table_header *dir_h = &dir->header; int err; + /* Is this a permanently empty directory? */ - if (is_empty_dir(&dir->header)) + if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ - if (header->ctl_table == sysctl_mount_point) { + if (sysctl_is_perm_empty_ctl_header(header)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; - set_empty_dir(dir); + sysctl_set_perm_empty_ctl_header(dir_h); } - dir->header.nreg++; + dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; @@ -238,70 +259,41 @@ fail: put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) - clear_empty_dir(dir); + sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; - drop_sysctl_table(&dir->header); + drop_sysctl_table(dir_h); return err; } -/* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (unlikely(p->unregistering)) return 0; p->used++; return 1; } -/* called under sysctl_lock */ static void unuse_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } -static void proc_sys_prune_dcache(struct ctl_table_header *head) +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { - struct inode *inode; - struct proc_inode *ei; - struct hlist_node *node; - struct super_block *sb; - - rcu_read_lock(); - for (;;) { - node = hlist_first_rcu(&head->inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); - spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sysctl_inodes); - spin_unlock(&sysctl_lock); - - inode = &ei->vfs_inode; - sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) - continue; - inode = igrab(inode); - rcu_read_unlock(); - if (unlikely(!inode)) { - deactivate_super(sb); - rcu_read_lock(); - continue; - } - - d_prune_aliases(inode); - iput(inode); - deactivate_super(sb); - - rcu_read_lock(); - } - rcu_read_unlock(); + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } -/* 
called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { + /* will reacquire if has to wait */ + lockdep_assert_held(&sysctl_lock); + /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock @@ -318,10 +310,10 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); } /* - * Prune dentries for unregistered sysctls: namespaced sysctls + * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ - proc_sys_prune_dcache(p); + proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. @@ -358,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root) return set; } -static struct ctl_table *lookup_entry(struct ctl_table_header **phead, - struct ctl_dir *dir, - const char *name, int namelen) +static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); @@ -388,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node) } static void first_entry(struct ctl_dir *dir, - struct ctl_table_header **phead, struct ctl_table **pentry) + struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; - struct ctl_table *entry = NULL; + const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); @@ -405,10 +397,10 @@ static void first_entry(struct ctl_dir *dir, *pentry = entry; } -static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; - struct ctl_table *entry = *pentry; + const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); @@ -441,7 +433,7 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; @@ -455,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i } static struct inode *proc_sys_make_inode(struct super_block *sb, - struct ctl_table_header *head, struct ctl_table *table) + struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; @@ -463,7 +455,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode = new_inode(sb); if (!inode) - goto out; + return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); @@ -473,16 +465,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); - inode = NULL; - goto out; + return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode->i_ctime = 
current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -492,21 +483,22 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; - if (is_empty_dir(head)) + if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) - root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + root->set_ownership(head, &inode->i_uid, &inode->i_gid); -out: return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); @@ -526,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; - struct ctl_table *p; + const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; @@ -548,14 +540,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (!inode) - goto out; - - err = NULL; - d_set_d_op(dentry, &proc_sys_dentry_operations); - d_add(dentry, inode); + err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) @@ -564,14 +550,15 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, - size_t count, loff_t *ppos, int write) +static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, + int write) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; + size_t count = iov_iter_count(iter); + char *kbuf; ssize_t error; - size_t res; if (IS_ERR(head)) return PTR_ERR(head); @@ -589,33 +576,60 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; + /* don't even try if the size is too large */ + error = -ENOMEM; + if (count >= KMALLOC_MAX_SIZE) + goto out; + kbuf = kvzalloc(count + 1, GFP_KERNEL); + if (!kbuf) + goto out; + + if (write) { + error = -EFAULT; + if (!copy_from_iter_full(kbuf, count, iter)) + goto out_free_buf; + kbuf[count] = '\0'; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + &iocb->ki_pos); + if (error) + goto out_free_buf; + /* careful: calling conventions are nasty here */ - res = count; - error = table->proc_handler(table, write, buf, &res, ppos); - if (!error) - error = res; + error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_iter(kbuf, count, iter) < count) + goto out_free_buf; + } + + error = count; +out_free_buf: + kvfree(kbuf); out: sysctl_head_finish(head); return error; } -static ssize_t proc_sys_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_read(struct kiocb *iocb, struct 
iov_iter *iter) { - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); + return proc_sys_call_handler(iocb, iter, 0); } -static ssize_t proc_sys_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); + return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) @@ -629,17 +643,17 @@ static int proc_sys_open(struct inode *inode, struct file *filp) return 0; } -static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; - unsigned int ret = DEFAULT_POLLMASK; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; + __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) - return POLLERR | POLLHUP; + return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; @@ -652,7 +666,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); - ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: @@ -664,7 +678,7 @@ out: static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; @@ -683,14 +697,19 @@ static bool proc_sys_fill_cache(struct file *file, if (IS_ERR(child)) return false; if (d_in_lookup(child)) { + struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (!inode) { - d_lookup_done(child); + res = d_splice_alias_ops(inode, child, + &proc_sys_dentry_operations); + d_lookup_done(child); + if (unlikely(res)) { dput(child); - return false; + + if (IS_ERR(res)) + return false; + + child = res; } - d_set_d_op(child, &proc_sys_dentry_operations); - d_add(child, inode); } } inode = d_inode(child); @@ -703,17 +722,17 @@ static bool proc_sys_fill_cache(struct file *file, static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: @@ -721,7 +740,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, struct ctl_table *table, +static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { @@ -745,7 +764,7 @@ static int proc_sys_readdir(struct file *file, struct 
dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; - struct ctl_table *entry; + const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; @@ -770,14 +789,15 @@ out: return 0; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; - struct ctl_table *table; + const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ @@ -798,7 +818,8 @@ static int proc_sys_permission(struct inode *inode, int mask) return error; } -static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +static int proc_sys_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; @@ -806,26 +827,26 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(inode, attr); - mark_inode_dirty(inode); + setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } -static int proc_sys_getattr(const struct path *path, struct kstat *stat, +static int proc_sys_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; @@ -836,8 +857,10 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat, static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, - .read = proc_sys_read, - .write = proc_sys_write, + .read_iter = proc_sys_read, + .write_iter = proc_sys_write, + .splice_read = copy_splice_read, + .splice_write = iter_file_splice_write, .llseek = default_llseek, }; @@ -860,7 +883,8 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; @@ -893,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry, struct ctl_table_header *head; struct inode *inode; - /* Although proc doesn't have negative dentries, rcu-walk means - * that inode here can be NULL */ - /* AV: can it, indeed? 
*/ - inode = d_inode_rcu(dentry); - if (!inode) - return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - head = rcu_dereference(PROC_I(inode)->sysctl); + + // false positive is fine here - we'll recheck anyway + if (d_in_lookup(dentry)) + return 0; + + inode = d_inode_rcu(dentry); + // we just might have run into dentry in the middle of __dentry_kill() + if (!inode) + return 1; + + head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } @@ -917,7 +945,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) @@ -936,19 +964,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + - sizeof(struct ctl_table)*2 + namelen + 1, + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); - new_name = (char *)(table + 2); + new_name = (char *)(table + 1); memcpy(new_name, name, namelen); - new_name[namelen] = '\0'; table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->dir.header.root, set, node, table); + init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } @@ -1005,8 +1032,8 @@ failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - pr_cont("/%*.*s %ld\n", - namelen, namelen, name, PTR_ERR(subdir)); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) @@ -1029,16 +1056,15 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry) + const struct ctl_table **pentry) { struct ctl_table_header *head; + const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; - struct ctl_table *entry; struct ctl_dir *dir; int ret; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); @@ -1062,7 +1088,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, return ret; } -static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; @@ -1078,72 +1104,96 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) 
return -EINVAL; } -static int sysctl_check_table_array(const char *path, struct ctl_table *table) +static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { + unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); + } + + if (table->proc_handler == proc_dou8vec_minmax) { + if (table->maxlen != sizeof(u8)) + err |= sysctl_err(path, table, "array not allowed"); + + if (table->extra1) { + extra = *(unsigned int *) table->extra1; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } + if (table->extra2) { + extra = *(unsigned int *) table->extra2; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } + } + + if (table->proc_handler == proc_dobool) { + if (table->maxlen != sizeof(bool)) + err |= sysctl_err(path, table, "array not allowed"); } return err; } -static int sysctl_check_table(const char *path, struct ctl_table *table) +static int sysctl_check_table(const char *path, struct ctl_table_header *header) { + const struct ctl_table *entry; int err = 0; - for (; table->procname; table++) { - if (table->child) - err |= sysctl_err(path, table, "Not a file"); - - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_douintvec) || - (table->proc_handler == proc_douintvec_minmax) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - err |= sysctl_err(path, table, "No data"); - if (!table->maxlen) - err |= sysctl_err(path, table, "No maxlen"); + list_for_each_table_entry(entry, header) { + if (!entry->procname) + err |= sysctl_err(path, entry, "procname is null"); + if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dobool) || + (entry->proc_handler == proc_dointvec) || + (entry->proc_handler == proc_douintvec) || + (entry->proc_handler == proc_douintvec_minmax) || + (entry->proc_handler == proc_dointvec_minmax) || + (entry->proc_handler == proc_dou8vec_minmax) || + (entry->proc_handler == proc_dointvec_jiffies) || + (entry->proc_handler == proc_dointvec_userhz_jiffies) || + (entry->proc_handler == proc_dointvec_ms_jiffies) || + (entry->proc_handler == proc_doulongvec_minmax) || + (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!entry->data) + err |= sysctl_err(path, entry, "No data"); + if (!entry->maxlen) + err |= sysctl_err(path, entry, "No maxlen"); else - err |= sysctl_check_table_array(path, table); + err |= sysctl_check_table_array(path, entry); } - if (!table->proc_handler) - err |= sysctl_err(path, table, "No proc_handler"); + if (!entry->proc_handler) + err |= sysctl_err(path, entry, "No proc_handler"); - if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) - err |= sysctl_err(path, table, "bogus .mode 0%o", - table->mode); + if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) + err |= sysctl_err(path, entry, "bogus .mode 0%o", + entry->mode); } return err; } -static struct ctl_table_header *new_links(struct 
ctl_dir *dir, struct ctl_table *table, - struct ctl_table_root *link_root) +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { - struct ctl_table *link_table, *entry, *link; + struct ctl_table *link_table, *link; struct ctl_table_header *links; + const struct ctl_table *entry; struct ctl_node *node; char *link_name; - int nr_entries, name_bytes; + int name_bytes; name_bytes = 0; - nr_entries = 0; - for (entry = table; entry->procname; entry++) { - nr_entries++; + list_for_each_table_entry(entry, head) { name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries + - sizeof(struct ctl_table)*(nr_entries + 1) + + sizeof(struct ctl_node)*head->ctl_table_size + + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); @@ -1151,33 +1201,41 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table return NULL; node = (struct ctl_node *)(links + 1); - link_table = (struct ctl_table *)(node + nr_entries); - link_name = (char *)&link_table[nr_entries + 1]; + link_table = (struct ctl_table *)(node + head->ctl_table_size); + link_name = (char *)(link_table + head->ctl_table_size); + link = link_table; - for (link = link_table, entry = table; entry->procname; link++, entry++) { + list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; - link->data = link_root; + link->data = head->root; link_name += len; + link++; } - init_header(links, dir->header.root, dir->header.set, node, link_table); - links->nreg = nr_entries; + init_header(links, dir->header.root, dir->header.set, node, link_table, + head->ctl_table_size); + links->nreg = head->ctl_table_size; return links; } static bool get_links(struct ctl_dir *dir, - struct ctl_table *table, struct ctl_table_root *link_root) + struct ctl_table_header *header, + struct ctl_table_root *link_root) { - struct ctl_table_header *head; - struct ctl_table *entry, *link; + struct ctl_table_header *tmp_head; + const struct ctl_table *entry, *link; + + if (header->ctl_table_size == 0 || + sysctl_is_perm_empty_ctl_header(header)) + return true; /* Are there links available for every entry in table? */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); + link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) @@ -1188,10 +1246,10 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. 
Increase the registration count on the links */ - for (entry = table; entry->procname; entry++) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); - head->nreg++; + link = find_entry(&tmp_head, dir, procname, strlen(procname)); + tmp_head->nreg++; } return true; } @@ -1199,7 +1257,7 @@ static bool get_links(struct ctl_dir *dir, static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; - struct ctl_dir *core_parent = NULL; + struct ctl_dir *core_parent; struct ctl_table_header *links; int err; @@ -1210,13 +1268,13 @@ static int insert_links(struct ctl_table_header *head) if (IS_ERR(core_parent)) return 0; - if (get_links(core_parent, head->ctl_table, head->root)) + if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); - links = new_links(core_parent, head->ctl_table, head->root); + links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; @@ -1224,7 +1282,7 @@ static int insert_links(struct ctl_table_header *head) goto out; err = 0; - if (get_links(core_parent, head->ctl_table, head->root)) { + if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } @@ -1237,34 +1295,64 @@ out: return err; } +/* Find the directory for the ctl_table. If one is not found create it. */ +static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) +{ + const char *name, *nextname; + + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + break; + } + return dir; +} + /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * + * @table: the top-level table structure. This table should not be free'd + * after registration. So it should not be used on stack. It can either + * be a global or dynamically allocated by the caller and free'd later + * after sysctl unregistration. + * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. + * array. * * The members of the &struct ctl_table structure are used as follows: - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file - * - * child - must be %NULL. 
- * + * data - a pointer to data for use by proc_handler + * maxlen - the maximum size in bytes of the data + * mode - the file permissions for the /proc/sys file + * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. + * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - @@ -1281,53 +1369,32 @@ out: */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table) + const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; - const char *name, *nextname; struct ctl_dir *dir; - struct ctl_table *entry; struct ctl_node *node; - int nr_entries = 0; - - for (entry = table; entry->procname; entry++) - nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); - init_header(header, root, set, node, table); - if (sysctl_check_table(path, table)) + init_header(header, root, set, node, table, table_size); + if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; - /* Reference moved down the diretory tree get_subdir */ + /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); - /* Find the directory for the ctl_table */ - for (name = path; name; name = nextname) { - int namelen; - nextname = strchr(name, '/'); - if (nextname) { - namelen = nextname - name; - nextname++; - } else { - namelen = strlen(name); - } - if (namelen == 0) - continue; - - dir = get_subdir(dir, name, namelen); - if (IS_ERR(dir)) - goto fail; - } - + dir = sysctl_mkdir_p(dir, path); + if (IS_ERR(dir)) + goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; @@ -1342,239 +1409,67 @@ fail_put_dir_locked: spin_unlock(&sysctl_lock); fail: kfree(header); - dump_stack(); return NULL; } /** - * register_sysctl - register a sysctl table - * @path: The path to the directory the sysctl table is in. - * @table: the table structure + * register_sysctl_sz - register a sysctl table + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The caller must ensure the life of the @table + * will be kept during the lifetime use of the sysctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the sysctl + * registration. + * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table.
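A hedged sketch of a caller of the sized API (hypothetical "foo" names, not from this patch): with register_sysctl_sz() the table may be const, needs no zero-filled sentinel entry because the element count is passed explicitly, and must outlive the registration.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sysctl.h>

static int foo_threshold = 10;

static const struct ctl_table foo_table[] = {
        {
                .procname       = "foo_threshold",
                .data           = &foo_threshold,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE_HUNDRED,
        },
};

static struct ctl_table_header *foo_header;

static int __init foo_sysctl_init(void)
{
        /* creates /proc/sys/dev/foo/foo_threshold; missing dirs are made on demand */
        foo_header = register_sysctl_sz("dev/foo", foo_table,
                                        ARRAY_SIZE(foo_table));
        return foo_header ? 0 : -ENOMEM;
}

Modular users pair this with unregister_sysctl_table(foo_header); built-in code can instead go through the __register_sysctl_init() path documented below, which tolerates registration failure.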
* * See __register_sysctl_table for more details. */ -struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, + size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, - path, table); -} -EXPORT_SYMBOL(register_sysctl); - -static char *append_path(const char *path, char *pos, const char *name) -{ - int namelen; - namelen = strlen(name); - if (((pos - path) + namelen + 2) >= PATH_MAX) - return NULL; - memcpy(pos, name, namelen); - pos[namelen] = '/'; - pos[namelen + 1] = '\0'; - pos += namelen + 1; - return pos; -} - -static int count_subheaders(struct ctl_table *table) -{ - int has_files = 0; - int nr_subheaders = 0; - struct ctl_table *entry; - - /* special case: no directory and empty directory */ - if (!table || !table->procname) - return 1; - - for (entry = table; entry->procname; entry++) { - if (entry->child) - nr_subheaders += count_subheaders(entry->child); - else - has_files = 1; - } - return nr_subheaders + has_files; -} - -static int register_leaf_sysctl_tables(const char *path, char *pos, - struct ctl_table_header ***subheader, struct ctl_table_set *set, - struct ctl_table *table) -{ - struct ctl_table *ctl_table_arg = NULL; - struct ctl_table *entry, *files; - int nr_files = 0; - int nr_dirs = 0; - int err = -ENOMEM; - - for (entry = table; entry->procname; entry++) { - if (entry->child) - nr_dirs++; - else - nr_files++; - } - - files = table; - /* If there are mixed files and directories we need a new table */ - if (nr_dirs && nr_files) { - struct ctl_table *new; - files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), - GFP_KERNEL); - if (!files) - goto out; - - ctl_table_arg = files; - for (new = files, entry = table; entry->procname; entry++) { - if (entry->child) - continue; - *new = *entry; - new++; - } - } - - /* Register everything except a directory full of subdirectories */ - if (nr_files || !nr_dirs) { - struct ctl_table_header *header; - header = __register_sysctl_table(set, path, files); - if (!header) { - kfree(ctl_table_arg); - goto out; - } - - /* Remember if we need to free the file table */ - header->ctl_table_arg = ctl_table_arg; - **subheader = header; - (*subheader)++; - } - - /* Recurse into the subdirectories. */ - for (entry = table; entry->procname; entry++) { - char *child_pos; - - if (!entry->child) - continue; - - err = -ENAMETOOLONG; - child_pos = append_path(path, pos, entry->procname); - if (!child_pos) - goto out; - - err = register_leaf_sysctl_tables(path, child_pos, subheader, - set, entry->child); - pos[0] = '\0'; - if (err) - goto out; - } - err = 0; -out: - /* On failure our caller will unregister all registered subheaders */ - return err; + path, table, table_size); } +EXPORT_SYMBOL(register_sysctl_sz); /** - * __register_sysctl_paths - register a sysctl table hierarchy - * @set: Sysctl tree to register on - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. + * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. 
+ * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * @table_size: The number of elements in table * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just lose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. * - * See __register_sysctl_table for more details. + * Context: if your base directory does not exist it will be created for you. */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_set *set, - const struct ctl_path *path, struct ctl_table *table) +void __init __register_sysctl_init(const char *path, const struct ctl_table *table, + const char *table_name, size_t table_size) { - struct ctl_table *ctl_table_arg = table; - int nr_subheaders = count_subheaders(table); - struct ctl_table_header *header = NULL, **subheaders, **subheader; - const struct ctl_path *component; - char *new_path, *pos; - - pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!new_path) - return NULL; + struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); - pos[0] = '\0'; - for (component = path; component->procname; component++) { - pos = append_path(new_path, pos, component->procname); - if (!pos) - goto out; - } - while (table->procname && table->child && !table[1].procname) { - pos = append_path(new_path, pos, table->procname); - if (!pos) - goto out; - table = table->child; - } - if (nr_subheaders == 1) { - header = __register_sysctl_table(set, new_path, table); - if (header) - header->ctl_table_arg = ctl_table_arg; - } else { - header = kzalloc(sizeof(*header) + - sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); - if (!header) - goto out; - - subheaders = (struct ctl_table_header **) (header + 1); - subheader = subheaders; - header->ctl_table_arg = ctl_table_arg; - - if (register_leaf_sysctl_tables(new_path, pos, &subheader, - set, table)) - goto err_register_leaves; - } - -out: - kfree(new_path); - return header; - -err_register_leaves: - while (subheader > subheaders) { - struct ctl_table_header *subh = *(--subheader); - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); + return; } - kfree(header); - header = NULL; - goto out; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details.
- */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root.default_set, - path, table); -} -EXPORT_SYMBOL(register_sysctl_paths); - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); + kmemleak_not_leak(hdr); } -EXPORT_SYMBOL(register_sysctl_table); static void put_links(struct ctl_table_header *header) { @@ -1582,7 +1477,7 @@ static void put_links(struct ctl_table_header *header) struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; - struct ctl_table *entry; + const struct ctl_table *entry; if (header->set == root_set) return; @@ -1591,9 +1486,9 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - for (entry = header->ctl_table; entry->procname; entry++) { + list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; - struct ctl_table *link; + const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); @@ -1605,7 +1500,7 @@ static void put_links(struct ctl_table_header *header) else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - pr_cont("/%s\n", name); + pr_cont("%s\n", name); } } } @@ -1617,8 +1512,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - put_links(header); - start_unregistering(header); + if (parent) { + put_links(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); @@ -1628,35 +1526,18 @@ static void drop_sysctl_table(struct ctl_table_header *header) /** * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table + * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. 
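For readers tracking the API change, here is a minimal usage sketch of the sized interface that replaces the hierarchy registration removed above. The "fs/example" path, the example_setting variable, and the proc_dointvec handler are illustrative assumptions, not part of this patch:

	static int example_setting;

	static const struct ctl_table example_table[] = {
		{
			.procname	= "example_setting",
			.data		= &example_setting,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
	};

	static struct ctl_table_header *example_hdr;

	/* the element count is passed explicitly instead of relying on a
	 * zero-filled sentinel entry to terminate the array */
	example_hdr = register_sysctl_sz("fs/example", example_table,
					 ARRAY_SIZE(example_table));
	/* ... on teardown; unregister_sysctl_table() returns early on NULL ... */
	unregister_sysctl_table(example_hdr);
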
*/ void unregister_sysctl_table(struct ctl_table_header * header) { - int nr_subheaders; might_sleep(); if (header == NULL) return; - nr_subheaders = count_subheaders(header->ctl_table_arg); - if (unlikely(nr_subheaders > 1)) { - struct ctl_table_header **subheaders; - int i; - - subheaders = (struct ctl_table_header **)(header + 1); - for (i = nr_subheaders -1; i >= 0; i--) { - struct ctl_table_header *subh = subheaders[i]; - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); - } - kfree(header); - return; - } - spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); @@ -1669,7 +1550,7 @@ void setup_sysctl_set(struct ctl_table_set *set, { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; - init_header(&set->dir.header, root, set, NULL, root_table); + init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) @@ -1683,8 +1564,163 @@ int __init proc_sys_init(void) proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; - proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return sysctl_init(); + return sysctl_init_bases(); +} + +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + +bool sysctl_is_alias(char *param) +{ + const char *alias = sysctl_find_alias(param); + + return alias != NULL; +} + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } + + if (!val) + return -EINVAL; + len = strlen(val); + if (len == 0) + return -EINVAL; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? 
There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s'\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 901bd06f437d..5c6a5ceab2f1 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * proc_tty.c -- handles /proc/tty * * Copyright 1997, Theodore Ts'o */ - -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> @@ -14,6 +13,7 @@ #include <linux/tty.h> #include <linux/seq_file.h> #include <linux/bitops.h> +#include "internal.h" /* * The /proc/tty directory inodes...
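The boot-time plumbing above is easiest to see with concrete parameters. Two hedged examples of command lines process_sysctl_arg() would accept (the specific sysctls are assumed for illustration; any writable sysctl behaves the same):

	sysctl.vm.overcommit_memory=1	/* "sysctl." prefix: dots become slashes and
					   "1" is written to sys/vm/overcommit_memory
					   on the temporary kern_mount of proc */
	hung_task_panic=1		/* bare legacy name: matched in sysctl_aliases[]
					   and treated as kernel.hung_task_panic */
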
@@ -124,18 +124,6 @@ static const struct seq_operations tty_drivers_op = { .show = show_tty_driver }; -static int tty_drivers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_drivers_op); -} - -static const struct file_operations proc_tty_drivers_operations = { - .open = tty_drivers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -145,11 +133,11 @@ void proc_tty_register_driver(struct tty_driver *driver) struct proc_dir_entry *ent; if (!driver->driver_name || driver->proc_entry || - !driver->ops->proc_fops) + !driver->ops->proc_show) return; - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); + ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_show, driver); driver->proc_entry = ent; } @@ -164,7 +152,7 @@ void proc_tty_unregister_driver(struct tty_driver *driver) if (!ent) return; - remove_proc_entry(driver->driver_name, proc_tty_driver); + remove_proc_entry(ent->name, proc_tty_driver); driver->proc_entry = NULL; } @@ -184,6 +172,6 @@ void __init proc_tty_init(void) * entry. */ proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); - proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); - proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); + proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops); + proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op); } diff --git a/fs/proc/root.c b/fs/proc/root.c index deecb397daa3..d8ca41d823e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * @@ -5,9 +6,6 @@ * * proc root directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -18,127 +16,361 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> #include "internal.h" -enum { - Opt_gid, Opt_hidepid, Opt_err, +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + enum proc_hidepid hidepid; + int gid; + enum proc_pidonly pidonly; }; -static const match_table_t tokens = { - {Opt_hidepid, "hidepid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_err, NULL}, +enum proc_param { + Opt_gid, + Opt_hidepid, + Opt_subset, + Opt_pidns, }; -int proc_parse_options(char *options, struct pid_namespace *pid) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - pid->pid_gid = make_kgid(current_user_ns(), option); - break; - case Opt_hidepid: - if (match_int(&args[0], &option)) - return 0; - if (option < HIDEPID_OFF || - option > HIDEPID_INVISIBLE) { - pr_err("proc: hidepid value must be between 0 and 2.\n"); - return 0; +static const struct fs_parameter_spec proc_fs_parameters[] = { + fsparam_u32("gid", Opt_gid), + fsparam_string("hidepid", Opt_hidepid), + fsparam_string("subset", 
Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), + {} +}; + +static inline int valid_hidepid(unsigned int value) +{ + return (value == HIDEPID_OFF || + value == HIDEPID_NO_ACCESS || + value == HIDEPID_INVISIBLE || + value == HIDEPID_NOT_PTRACEABLE); +} + +static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); + struct fs_parse_result result; + int base = (unsigned long)hidepid_u32_spec.data; + + if (param->type != fs_value_is_string) + return invalf(fc, "proc: unexpected type of hidepid value\n"); + + if (!kstrtouint(param->string, base, &result.uint_32)) { + if (!valid_hidepid(result.uint_32)) + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + ctx->hidepid = result.uint_32; + return 0; + } + + if (!strcmp(param->string, "off")) + ctx->hidepid = HIDEPID_OFF; + else if (!strcmp(param->string, "noaccess")) + ctx->hidepid = HIDEPID_NO_ACCESS; + else if (!strcmp(param->string, "invisible")) + ctx->hidepid = HIDEPID_INVISIBLE; + else if (!strcmp(param->string, "ptraceable")) + ctx->hidepid = HIDEPID_NOT_PTRACEABLE; + else + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + + return 0; +} + +static int proc_parse_subset_param(struct fs_context *fc, char *value) +{ + struct proc_fs_context *ctx = fc->fs_private; + + while (value) { + char *ptr = strchr(value, ','); + + if (ptr != NULL) + *ptr++ = '\0'; + + if (*value != '\0') { + if (!strcmp(value, "pid")) { + ctx->pidonly = PROC_PIDONLY_ON; + } else { + return invalf(fc, "proc: unsupported subset option - %s\n", value); } - pid->hide_pid = option; - break; - default: - pr_err("proc: unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; } + value = ptr; } - return 1; + return 0; } -int proc_remount(struct super_block *sb, int *flags, char *data) +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) { - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } - sync_filesystem(sb); - return !proc_parse_options(data, pid); + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ns_type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). 
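Because pidns is declared with fsparam_file_or_string(), userspace may pass either a path or an already-open nsfs file descriptor. A hypothetical userspace sketch using the new mount API (error handling omitted; the fsopen()/fsconfig()/fsmount() syscall wrappers are assumed to be available, as in glibc 2.36+):

	int fsfd = fsopen("proc", FSOPEN_CLOEXEC);

	/* string form: the kernel filp_open()s the path itself, as above */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
	/* fd form: fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); */

	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	int mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
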
+ */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; } +#endif /* CONFIG_PID_NS */ -static struct dentry *proc_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct pid_namespace *ns; + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt, err; + + opt = fs_parse(fc, proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; + break; + + case Opt_subset: + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; + break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif + + default: + return -EINVAL; + } + + ctx->mask |= 1 << opt; + return 0; +} + +static void proc_apply_options(struct proc_fs_info *fs_info, + struct fs_context *fc, + struct user_namespace *user_ns) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + fs_info->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + fs_info->hide_pid = ctx->hidepid; + if (ctx->mask & (1 << Opt_subset)) + fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } +} + +static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct inode *root_inode; + struct proc_fs_info *fs_info; + int ret; + + fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + proc_apply_options(fs_info, fc, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + s->s_fs_info = fs_info; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink->seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, 
&proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } - if (flags & MS_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = task_active_pid_ns(current); + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; } - return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct proc_fs_info *fs_info = proc_sb_info(sb); + + sync_filesystem(sb); + + proc_apply_options(fs_info, fc, current_user_ns()); + return 0; +} + +static int proc_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, proc_fill_super); +} + +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; } static void proc_kill_sb(struct super_block *sb) { - struct pid_namespace *ns; + struct proc_fs_info *fs_info = proc_sb_info(sb); - ns = (struct pid_namespace *)sb->s_fs_info; - if (ns->proc_self) - dput(ns->proc_self); - if (ns->proc_thread_self) - dput(ns->proc_thread_self); kill_anon_super(sb); - put_pid_ns(ns); + if (fs_info) { + put_pid_ns(fs_info->pid_ns); + kfree_rcu(fs_info, rcu); + } } static struct file_system_type proc_fs_type = { - .name = "proc", - .mount = proc_mount, - .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { - int err; - - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ @@ -149,21 +381,30 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. 
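Taken together with proc_apply_options() above, a representative legacy mount(2) call exercising all three string parameters might look like this (the values are illustrative only):

	/* each option is routed through proc_parse_param() */
	mount("proc", "/proc", "proc", 0,
	      "hidepid=invisible,gid=1000,subset=pid");

hidepid accepts the historical numeric values as well as the string forms parsed in proc_parse_hidepid_param(), and subset=pid restricts the instance to the per-process directories.
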
+ */ + register_filesystem(&proc_fs_type); } -static int proc_root_getattr(const struct path *path, struct kstat *stat, +static int proc_root_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_pid_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dentry, flags)) return NULL; - + return proc_lookup(dir, dentry, flags); } @@ -202,31 +443,14 @@ static const struct inode_operations proc_root_inode_operations = { * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { - .low_ino = PROC_ROOT_INO, - .namelen = 5, - .mode = S_IFDIR | S_IRUGO | S_IXUGO, - .nlink = 2, - .count = ATOMIC_INIT(1), - .proc_iops = &proc_root_inode_operations, - .proc_fops = &proc_root_operations, + .low_ino = PROCFS_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .refcnt = REFCOUNT_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct vfsmount *mnt; - - mnt = kern_mount_data(&proc_fs_type, ns); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} diff --git a/fs/proc/self.c b/fs/proc/self.c index 39857f6db5cf..62d2c0cfe35c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -10,17 +12,17 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? 
ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d", tgid); + sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } @@ -29,40 +31,36 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum; +unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - + int ret = -ENOMEM; + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { - struct inode *inode = new_inode_pseudo(s); + struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; - d_add(self, inode); - } else { - dput(self); - self = ERR_PTR(-ENOMEM); + d_make_persistent(self, inode); + ret = 0; } - } else { - self = ERR_PTR(-ENOMEM); + dput(self); } inode_unlock(root_inode); - if (IS_ERR(self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - return PTR_ERR(self); - } - ns->proc_self = self; - return 0; + + return ret; } void __init proc_self_init(void) diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index ad8a77f94beb..04bb29721419 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... display the number of softirqs @@ -18,27 +20,18 @@ static int show_softirqs(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) - seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10); seq_putc(p, '\n'); } return 0; } -static int softirqs_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_softirqs, NULL); -} - -static const struct file_operations proc_softirqs_operations = { - .open = softirqs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_softirqs_init(void) { - proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bd4e55f4aa20..8b444e862319 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> @@ -9,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> @@ -20,31 +22,7 @@ #define arch_irq_stat() 0 #endif -#ifdef arch_idle_time - -static u64 get_idle_time(int cpu) -{ - u64 idle; - - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; - if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); - return idle; -} - -static u64 get_iowait_time(int cpu) -{ - u64 iowait; - - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; - if (cpu_online(cpu) && nr_iowait_cpu(cpu)) - 
iowait += arch_idle_time(cpu); - return iowait; -} - -#else - -static u64 get_idle_time(int cpu) +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -53,14 +31,14 @@ static u64 get_idle_time(int cpu) if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; @@ -69,14 +47,37 @@ static u64 get_iowait_time(int cpu) if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } -#endif +static void show_irq_gap(struct seq_file *p, unsigned int gap) +{ + static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; + + while (gap > 0) { + unsigned int inc; + + inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); + seq_write(p, zeros, 2 * inc); + gap -= inc; + } +} + +static void show_all_irqs(struct seq_file *p) +{ + unsigned int i, next = 0; + + for_each_active_irq(i) { + show_irq_gap(p, i - next); + seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); + next = i + 1; + } + show_irq_gap(p, irq_get_nr_irqs() - next); +} static int show_stat(struct seq_file *p, void *v) { @@ -92,20 +93,27 @@ static int show_stat(struct seq_file *p, void *v) irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); + /* shift boot timestamp according to the timens offset */ + timens_sub_boottime(&boottime); for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + nice += cpustat[CPUTIME_NICE]; + system += cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(&kcpustat, i); + iowait += get_iowait_time(&kcpustat, i); + irq += cpustat[CPUTIME_IRQ]; + softirq += cpustat[CPUTIME_SOFTIRQ]; + steal += cpustat[CPUTIME_STEAL]; + guest += cpustat[CPUTIME_GUEST]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -129,17 +137,22 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = 
kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = cpustat[CPUTIME_USER]; + nice = cpustat[CPUTIME_NICE]; + system = cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(&kcpustat, i); + iowait = get_iowait_time(&kcpustat, i); + irq = cpustat[CPUTIME_IRQ]; + softirq = cpustat[CPUTIME_SOFTIRQ]; + steal = cpustat[CPUTIME_STEAL]; + guest = cpustat[CPUTIME_GUEST]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); @@ -155,16 +168,14 @@ static int show_stat(struct seq_file *p, void *v) } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); - /* sum again ? it could be updated? */ - for_each_irq_nr(j) - seq_put_decimal_ull(p, " ", kstat_irqs_usr(j)); + show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", + "procs_running %u\n" + "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, @@ -182,23 +193,24 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_online_cpus(); + unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ - size += 2 * nr_irqs; + size += 2 * irq_get_nr_irqs(); return single_open_size(file, show_stat, NULL, size); } -static const struct file_operations proc_stat_operations = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = stat_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init proc_stat_init(void) { - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b836fd61ed87..81dfc26bfae8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,8 +1,10 @@ -#include <linux/mm.h> -#include <linux/vmacache.h> +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagewalk.h> +#include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> +#include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> @@ -12,24 +14,34 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> +#include <linux/uaccess.h> +#include <linux/pkeys.h> +#include <linux/minmax.h> +#include <linux/overflow.h> +#include <linux/buildid.h> #include <asm/elf.h> -#include <linux/uaccess.h> +#include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" +#define SENTINEL_VMA_END -1 +#define SENTINEL_VMA_GATE -2 + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, 
MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); + anon = get_mm_counter_sum(mm, MM_ANONPAGES); + file = get_mm_counter_sum(mm, MM_FILEPAGES); + shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and @@ -45,44 +57,34 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; - lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; - swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, - swap << (PAGE_SHIFT-10)); + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + + swap = get_mm_counter_sum(mm, MM_SWAPENTS); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -93,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; - *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } @@ -128,100 +130,214 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv) +#ifdef CONFIG_PER_VMA_LOCK + +static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) { - struct mm_struct *mm = priv->mm; + lock_ctx->locked_vma = NULL; + 
lock_ctx->mmap_locked = false; +} - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); +static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->locked_vma) { + vma_end_read(lock_ctx->locked_vma); + lock_ctx->locked_vma = NULL; + } } -static struct vm_area_struct * -m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) +static const struct seq_operations proc_pid_maps_op; + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_locking_ctx *lock_ctx) { - if (vma == priv->tail_vma) - return NULL; - return vma->vm_next ?: priv->tail_vma; + /* + * smaps and numa_maps perform page table walk, therefore require + * mmap_lock but maps can be read with locking just the vma and + * walking the vma tree under rcu read protection. + */ + if (m->op != &proc_pid_maps_op) { + if (mmap_read_lock_killable(lock_ctx->mm)) + return false; + + lock_ctx->mmap_locked = true; + } else { + rcu_read_lock(); + reset_lock_ctx(lock_ctx); + } + + return true; +} + +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); + } else { + unlock_ctx_vma(lock_ctx); + rcu_read_unlock(); + } } -static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) { - if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; + struct vm_area_struct *vma; + + if (lock_ctx->mmap_locked) + return vma_next(&priv->iter); + + unlock_ctx_vma(lock_ctx); + vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos); + if (!IS_ERR_OR_NULL(vma)) + lock_ctx->locked_vma = vma; + + return vma; +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; + + if (lock_ctx->mmap_locked) + return false; + + rcu_read_unlock(); + mmap_read_lock(lock_ctx->mm); + /* Reinitialize the iterator after taking mmap_lock */ + vma_iter_set(&priv->iter, pos); + lock_ctx->mmap_locked = true; + + return true; +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm) == 0; +} + +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + return vma_next(&priv->iter); +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + return false; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma; + +retry: + vma = get_next_vma(priv, *ppos); + /* EINTR or EAGAIN is possible */ + if (IS_ERR(vma)) { + if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) + goto retry; + + return vma; + } + + /* Store previous position to be able to restart if needed */ + priv->last_pos = *ppos; + if (vma) { + /* + * Track the end of the reported vma to ensure position changes + * even if previous vma was merged with the next vma and we + * found the extended vma with the same vm_start.
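An informal summary of the *ppos convention implemented by proc_get_vma(), m_next() and m_start(), derived from the code rather than taken from the patch itself:

	/*
	 *   *ppos > 0                resume after the VMA that ended at *ppos
	 *   SENTINEL_VMA_GATE (-2)   real VMAs exhausted; report the gate VMA
	 *   SENTINEL_VMA_END  (-1)   iteration finished; m_start() returns NULL
	 */
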
+ */ + *ppos = vma->vm_end; + } else { + *ppos = SENTINEL_VMA_GATE; + vma = get_gate_vma(priv->lock_ctx.mm); + } + + return vma; } static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + struct proc_maps_locking_ctx *lock_ctx; + loff_t last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int pos = *ppos; - /* See m_cache_vma(). Zero at the start or after lseek. */ - if (last_addr == -1UL) + /* See m_next(). Zero at the start or after lseek. */ + if (last_addr == SENTINEL_VMA_END) return NULL; priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + lock_ctx = &priv->lock_ctx; + mm = lock_ctx->mm; + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; - - down_read(&mm->mmap_sem); - hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - if (last_addr) { - vma = find_vma(mm, last_addr - 1); - if (vma && vma->vm_start <= last_addr) - vma = m_next_vma(priv, vma); - if (vma) - return vma; } - m->version = 0; - if (pos < mm->map_count) { - for (vma = mm->mmap; pos; pos--) { - m->version = vma->vm_start; - vma = vma->vm_next; - } - return vma; + if (!lock_vma_range(m, lock_ctx)) { + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; + return ERR_PTR(-EINTR); } - /* we do not bother to update m->version in this case */ - if (pos == mm->map_count && priv->tail_vma) - return priv->tail_vma; + /* + * Reset current position if last_addr was set before + * and it's not a sentinel. + */ + if (last_addr > 0) + *ppos = last_addr = priv->last_pos; + vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); + hold_task_mempolicy(priv); + if (last_addr == SENTINEL_VMA_GATE) + return get_gate_vma(mm); - vma_stop(priv); - return NULL; + return proc_get_vma(m, ppos); } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next; - - (*pos)++; - next = m_next_vma(priv, v); - if (!next) - vma_stop(priv); - return next; + if (*ppos == SENTINEL_VMA_GATE) { + *ppos = SENTINEL_VMA_END; + return NULL; + } + return proc_get_vma(m, ppos); } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->lock_ctx.mm; - if (!IS_ERR_OR_NULL(v)) - vma_stop(priv); - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + release_task_mempolicy(priv); + unlock_vma_range(&priv->lock_ctx); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, @@ -233,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->lock_ctx.mm)) { + int err = PTR_ERR(priv->lock_ctx.mm); seq_release_private(inode, file); return err; @@ -249,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return 
seq_release_private(inode, file); } @@ -262,37 +378,101 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) +static void get_vma_name(struct vm_area_struct *vma, + const struct path **path, + const char **name, + const char **name_fmt) { + struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; + + *name = NULL; + *path = NULL; + *name_fmt = NULL; + /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * in languages like Go. + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; + if (vma->vm_file) { + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ...), use the provided name. + */ + if (anon_name) { + *name_fmt = "[anon_shmem:%s]"; + *name = anon_name->name; + } else { + *path = file_user_path(vma->vm_file); + } + return; + } + + if (vma->vm_ops && vma->vm_ops->name) { + *name = vma->vm_ops->name(vma); + if (*name) + return; + } + + *name = arch_vma_name(vma); + if (*name) + return; + + if (!vma->vm_mm) { + *name = "[vdso]"; + return; + } + + if (vma_is_initial_heap(vma)) { + *name = "[heap]"; + return; + } + + if (vma_is_initial_stack(vma)) { + *name = "[stack]"; + return; + } + + if (anon_name) { + *name_fmt = "[anon:%s]"; + *name = anon_name->name; + return; + } +} + +static void show_vma_header_prefix(struct seq_file *m, + unsigned long start, unsigned long end, + vm_flags_t flags, unsigned long long pgoff, + dev_t dev, unsigned long ino) +{ + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void -show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) +show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { - struct mm_struct *mm = vma->vm_mm; - struct file *file = vma->vm_file; - struct proc_maps_private *priv = m->private; + const struct path *path; + const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - const char *name = NULL; - if (file) { - struct inode *inode = file_inode(vma->vm_file); + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -300,98 +480,340 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) start = vma->vm_start; end = vma->vm_end; + show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); - seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ?
's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); - - /* - * Print the dentry name for named mappings, and a - * special [heap] marker for the heap: - */ - if (file) { + get_vma_name(vma, &path, &name, &name_fmt); + if (path) { + seq_pad(m, ' '); + seq_path(m, path, "\n"); + } else if (name_fmt) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); - goto done; + seq_printf(m, name_fmt, name); + } else if (name) { + seq_pad(m, ' '); + seq_puts(m, name); } + seq_putc(m, '\n'); +} - if (vma->vm_ops && vma->vm_ops->name) { - name = vma->vm_ops->name(vma); - if (name) - goto done; +static int show_map(struct seq_file *m, void *v) +{ + show_map_vma(m, v); + return 0; +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +#define PROCMAP_QUERY_VMA_FLAGS ( \ + PROCMAP_QUERY_VMA_READABLE | \ + PROCMAP_QUERY_VMA_WRITABLE | \ + PROCMAP_QUERY_VMA_EXECUTABLE | \ + PROCMAP_QUERY_VMA_SHARED \ +) + +#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ + PROCMAP_QUERY_FILE_BACKED_VMA | \ + PROCMAP_QUERY_VMA_FLAGS \ +) + +#ifdef CONFIG_PER_VMA_LOCK + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) +{ + reset_lock_ctx(lock_ctx); + + return 0; +} + +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) +{ + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); + lock_ctx->mmap_locked = false; + } else { + unlock_ctx_vma(lock_ctx); } +} - name = arch_vma_name(vma); - if (!name) { - if (!mm) { - name = "[vdso]"; - goto done; - } +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) +{ + struct mm_struct *mm = lock_ctx->mm; + struct vm_area_struct *vma; + struct vma_iterator vmi; - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { - name = "[heap]"; - goto done; - } + if (lock_ctx->mmap_locked) + return find_vma(mm, addr); - if (is_stack(priv, vma)) - name = "[stack]"; + /* Unlock previously locked VMA and find the next one under RCU */ + unlock_ctx_vma(lock_ctx); + rcu_read_lock(); + vma_iter_init(&vmi, mm, addr); + vma = lock_next_vma(mm, &vmi, addr); + rcu_read_unlock(); + + if (!vma) + return NULL; + + if (!IS_ERR(vma)) { + lock_ctx->locked_vma = vma; + return vma; } -done: - if (name) { - seq_pad(m, ' '); - seq_puts(m, name); + if (PTR_ERR(vma) == -EAGAIN) { + /* Fallback to mmap_lock on vma->vm_refcnt overflow */ + mmap_read_lock(mm); + vma = find_vma(mm, addr); + lock_ctx->mmap_locked = true; } - seq_putc(m, '\n'); + + return vma; } -static int show_map(struct seq_file *m, void *v, int is_pid) +#else /* CONFIG_PER_VMA_LOCK */ + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) { - show_map_vma(m, v, is_pid); - m_cache_vma(m, v); - return 0; + return mmap_read_lock_killable(lock_ctx->mm); } -static int show_pid_map(struct seq_file *m, void *v) +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { - return show_map(m, v, 1); + mmap_read_unlock(lock_ctx->mm); } -static int show_tid_map(struct seq_file *m, void *v) +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) { - return show_map(m, v, 0); + return find_vma(lock_ctx->mm, addr); } -static const struct seq_operations proc_pid_maps_op = { - .start = m_start, - .next = m_next, - .stop = 
m_stop, - .show = show_pid_map -}; +#endif /* CONFIG_PER_VMA_LOCK */ -static const struct seq_operations proc_tid_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map -}; +static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr, u32 flags) +{ + struct vm_area_struct *vma; -static int pid_maps_open(struct inode *inode, struct file *file) +next_vma: + vma = query_vma_find_by_addr(lock_ctx, addr); + if (IS_ERR(vma)) + return vma; + + if (!vma) + goto no_vma; + + /* user requested only file-backed VMA, keep iterating */ + if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) + goto skip_vma; + + /* VMA permissions should satisfy query flags */ + if (flags & PROCMAP_QUERY_VMA_FLAGS) { + u32 perm = 0; + + if (flags & PROCMAP_QUERY_VMA_READABLE) + perm |= VM_READ; + if (flags & PROCMAP_QUERY_VMA_WRITABLE) + perm |= VM_WRITE; + if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) + perm |= VM_EXEC; + if (flags & PROCMAP_QUERY_VMA_SHARED) + perm |= VM_MAYSHARE; + + if ((vma->vm_flags & perm) != perm) + goto skip_vma; + } + + /* found covering VMA or user is OK with the matching next VMA */ + if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) + return vma; + +skip_vma: + /* + * If the user needs closest matching VMA, keep iterating. + */ + addr = vma->vm_end; + if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) + goto next_vma; + +no_vma: + return ERR_PTR(-ENOENT); +} + +static int do_procmap_query(struct mm_struct *mm, void __user *uarg) { - return do_maps_open(inode, file, &proc_pid_maps_op); + struct proc_maps_locking_ctx lock_ctx = { .mm = mm }; + struct procmap_query karg; + struct vm_area_struct *vma; + const char *name = NULL; + char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; + __u64 usize; + int err; + + if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) + return -EFAULT; + /* argument struct can never be that large, reject abuse */ + if (usize > PAGE_SIZE) + return -E2BIG; + /* argument struct should have at least query_flags and query_addr fields */ + if (usize < offsetofend(struct procmap_query, query_addr)) + return -EINVAL; + err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + if (err) + return err; + + /* reject unknown flags */ + if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) + return -EINVAL; + /* either both buffer address and size are set, or both should be zero */ + if (!!karg.vma_name_size != !!karg.vma_name_addr) + return -EINVAL; + if (!!karg.build_id_size != !!karg.build_id_addr) + return -EINVAL; + + if (!mm || !mmget_not_zero(mm)) + return -ESRCH; + + err = query_vma_setup(&lock_ctx); + if (err) { + mmput(mm); + return err; + } + + vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + vma = NULL; + goto out; + } + + karg.vma_start = vma->vm_start; + karg.vma_end = vma->vm_end; + + karg.vma_flags = 0; + if (vma->vm_flags & VM_READ) + karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; + if (vma->vm_flags & VM_WRITE) + karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; + if (vma->vm_flags & VM_EXEC) + karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; + if (vma->vm_flags & VM_MAYSHARE) + karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; + + karg.vma_page_size = vma_kernel_pagesize(vma); + + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + + karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; + karg.dev_major = MAJOR(inode->i_sb->s_dev); + 
karg.dev_minor = MINOR(inode->i_sb->s_dev); + karg.inode = inode->i_ino; + } else { + karg.vma_offset = 0; + karg.dev_major = 0; + karg.dev_minor = 0; + karg.inode = 0; + } + + if (karg.build_id_size) { + __u32 build_id_sz; + + err = build_id_parse(vma, build_id_buf, &build_id_sz); + if (err) { + karg.build_id_size = 0; + } else { + if (karg.build_id_size < build_id_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.build_id_size = build_id_sz; + } + } + + if (karg.vma_name_size) { + size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); + const struct path *path; + const char *name_fmt; + size_t name_sz = 0; + + get_vma_name(vma, &path, &name, &name_fmt); + + if (path || name_fmt || name) { + name_buf = kmalloc(name_buf_sz, GFP_KERNEL); + if (!name_buf) { + err = -ENOMEM; + goto out; + } + } + if (path) { + name = d_path(path, name_buf, name_buf_sz); + if (IS_ERR(name)) { + err = PTR_ERR(name); + goto out; + } + name_sz = name_buf + name_buf_sz - name; + } else if (name || name_fmt) { + name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); + name = name_buf; + } + if (name_sz > name_buf_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.vma_name_size = name_sz; + } + + /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ + query_vma_teardown(&lock_ctx); + mmput(mm); + + if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), + name, karg.vma_name_size)) { + kfree(name_buf); + return -EFAULT; + } + kfree(name_buf); + + if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), + build_id_buf, karg.build_id_size)) + return -EFAULT; + + if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) + return -EFAULT; + + return 0; + +out: + query_vma_teardown(&lock_ctx); + mmput(mm); + kfree(name_buf); + return err; } -static int tid_maps_open(struct inode *inode, struct file *file) +static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return do_maps_open(inode, file, &proc_tid_maps_op); + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + switch (cmd) { + case PROCMAP_QUERY: + /* priv->lock_ctx.mm is set during file open operation */ + return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg); + default: + return -ENOIOCTLCMD; + } } const struct file_operations proc_pid_maps_operations = { @@ -399,13 +821,8 @@ const struct file_operations proc_pid_maps_operations = { .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, -}; - -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, + .unlocked_ioctl = procfs_procmap_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -439,94 +856,177 @@ struct mem_size_stats { unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; + unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long ksm; u64 pss; + u64 pss_anon; + u64 pss_file; + u64 pss_shmem; + u64 pss_dirty; + u64 pss_locked; u64 swap_pss; - bool check_shmem_swap; }; +static void smaps_page_accumulate(struct mem_size_stats *mss, + struct folio *folio, unsigned long size, unsigned long pss, + bool dirty, bool locked, bool private) +{ + mss->pss += pss; + + if (folio_test_anon(folio)) + mss->pss_anon += pss; + else if (folio_test_swapbacked(folio)) + mss->pss_shmem += pss; + else + mss->pss_file += pss; + + if (locked) + 
mss->pss_locked += pss; + + if (dirty || folio_test_dirty(folio)) { + mss->pss_dirty += pss; + if (private) + mss->private_dirty += size; + else + mss->shared_dirty += size; + } else { + if (private) + mss->private_clean += size; + else + mss->shared_clean += size; + } +} + static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty) + bool compound, bool young, bool dirty, bool locked, + bool present) { - int i, nr = compound ? 1 << compound_order(page) : 1; + struct folio *folio = page_folio(page); + int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; + bool exclusive; + int mapcount; - if (PageAnon(page)) { + /* + * First accumulate quantities that depend only on |size| and the type + * of the compound page. + */ + if (folio_test_anon(folio)) { mss->anonymous += size; - if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + if (!folio_test_swapbacked(folio) && !dirty && + !folio_test_dirty(folio)) mss->lazyfree += size; } + if (folio_test_ksm(folio)) + mss->ksm += size; + mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || page_is_young(page) || PageReferenced(page)) + if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* - * page_count(page) == 1 guarantees the page is mapped exactly once. - * If any subpage of the compound page mapped with PTE it would elevate - * page_count(). + * Then accumulate quantities that may depend on sharing, or that may + * differ page-by-page. + * + * refcount == 1 for present entries guarantees that the folio is mapped + * exactly once. For large folios this implies that exactly one + * PTE/PMD/... maps (a part of) this folio. + * + * Treat all non-present entries (where relying on the mapcount and + * refcount doesn't make sense) as "maybe shared, but not sure how + * often". We treat device private entries as being fake-present. + * + * Note that it would not be safe to read the mapcount especially for + * pages referenced by migration entries, even with the PTL held. */ - if (page_count(page) == 1) { - if (dirty || PageDirty(page)) - mss->private_dirty += size; - else - mss->private_clean += size; - mss->pss += (u64)size << PSS_SHIFT; + if (folio_ref_count(folio) == 1 || !present) { + smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, + dirty, locked, present); return; } + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + mapcount = folio_average_page_mapcount(folio); + exclusive = !folio_maybe_mapped_shared(folio); + } + + /* + * We obtain a snapshot of the mapcount. Without holding the folio lock + * this snapshot can be slightly wrong as we cannot always read the + * mapcount atomically. 
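+	 *
+	 * (Illustrative note, not part of this patch: Pss is kept as a
+	 * fixed-point value, shifted left by PSS_SHIFT, so the per-mapper
+	 * share PAGE_SIZE / mapcount retains its fractional part. E.g. a
+	 * 4 KiB page mapped by three processes contributes roughly 1365
+	 * bytes to each mapper's Pss; the shift is undone only when the
+	 * value is printed.)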
+ */ for (i = 0; i < nr; i++, page++) { - int mapcount = page_mapcount(page); + unsigned long pss = PAGE_SIZE << PSS_SHIFT; - if (mapcount >= 2) { - if (dirty || PageDirty(page)) - mss->shared_dirty += PAGE_SIZE; - else - mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; - } else { - if (dirty || PageDirty(page)) - mss->private_dirty += PAGE_SIZE; - else - mss->private_clean += PAGE_SIZE; - mss->pss += PAGE_SIZE << PSS_SHIFT; + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + mapcount = folio_precise_page_mapcount(folio, page); + exclusive = mapcount < 2; } + + if (mapcount >= 2) + pss /= mapcount; + smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, + dirty, locked, exclusive); } } #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; - mss->swap += shmem_partial_swap_usage( - walk->vma->vm_file->f_mapping, addr, end); + mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, + linear_page_index(vma, addr), + linear_page_index(vma, end)); return 0; } +#else +#define smaps_pte_hole NULL +#endif /* CONFIG_SHMEM */ + +static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) +{ +#ifdef CONFIG_SHMEM + if (walk->ops->pte_hole) { + /* depth is not used */ + smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); + } #endif +} static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool present = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + present = true; + } else if (pte_none(ptent)) { + smaps_pte_hole_lookup(addr, walk); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (!non_swap_entry(swpent)) { + if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; - mapcount = swp_swapcount(swpent); + mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; @@ -535,27 +1035,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); - } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap - && pte_none(*pte))) { - page = find_get_entry(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); - if (!page) - return; - - if (radix_tree_exceptional_entry(page)) - mss->swap += PAGE_SIZE; - else - put_page(page); - - return; + } else if (softleaf_has_pfn(entry)) { + if (softleaf_is_device_private(entry)) + present = true; + page = softleaf_to_page(entry); + } } if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); + smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -564,21 +1054,36 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - struct page *page; + bool locked = 
!!(vma->vm_flags & VM_LOCKED); + struct page *page = NULL; + bool present = false; + struct folio *folio; - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (pmd_none(*pmd)) + return; + if (pmd_present(*pmd)) { + page = vm_normal_page_pmd(vma, addr, *pmd); + present = true; + } else if (unlikely(thp_migration_supported())) { + const softleaf_t entry = softleaf_from_pmd(*pmd); + + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; - if (PageAnon(page)) + folio = page_folio(page); + if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; - else if (PageSwapBacked(page)) + else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; - else if (is_zone_device_page(page)) + else if (folio_is_zone_device(folio)) /* pass */; else - VM_BUG_ON_PAGE(1, page); - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); + mss->file_thp += HPAGE_PMD_SIZE; + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -598,20 +1103,18 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - return 0; + goto out; } - if (pmd_trans_unstable(pmd)) - return 0; - /* - * The mmap_sem held all the way back in m_start() is what - * keeps khugepaged out of here and from collapsing things - * in here. - */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); +out: cond_resched(); return 0; } @@ -620,8 +1123,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. + * + * The length of the second argument of mnemonics[] + * needs to be 3 instead of previously set 2 + * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) + * to avoid spurious + * -Werror=unterminated-string-initialization warning + * with GCC 15 */ - static const char mnemonics[BITS_PER_LONG][2] = { + static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. 
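	 *
	 * (Illustrative note, not part of this patch: each mnemonic is a
	 * two-letter code, and the third byte added by this patch holds
	 * the terminating NUL, so the entries can be emitted as plain C
	 * strings via seq_printf("%s", ...) below.)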
*/ @@ -637,21 +1147,24 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", - [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif + [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif @@ -661,13 +1174,34 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", #endif +#if CONFIG_ARCH_PKEY_BITS > 4 + [ilog2(VM_PKEY_BIT4)] = "", +#endif +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + [ilog2(VM_UFFD_MINOR)] = "ui", +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK + [ilog2(VM_SHADOW_STACK)] = "ss", +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) + [ilog2(VM_DROPPABLE)] = "dp", +#endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", +#endif }; size_t i; @@ -675,10 +1209,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); - } + if (vma->vm_flags & (1UL << i)) + seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } @@ -690,48 +1222,66 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - struct page *page = NULL; + struct folio *folio = NULL; + bool present = false; + spinlock_t *ptl; + pte_t ptent; - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); + ptent = huge_ptep_get(walk->mm, addr, pte); + if (pte_present(ptent)) { + folio = page_folio(pte_page(ptent)); + present = true; + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); + if (softleaf_has_pfn(entry)) + folio = softleaf_to_folio(entry); } - if (page) { - int mapcount = page_mapcount(page); - if (mapcount >= 2) + if (folio) { + /* We treat non-present entries as "maybe shared". 
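+	  * (Illustrative note, not part of this patch: a hugetlb page is
+	  * charged to Shared_Hugetlb when it is, or may be, mapped more
+	  * than once -- including via a PMD shared between processes --
+	  * and to Private_Hugetlb only when it is known to be exclusive.)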
*/ + if (!present || folio_maybe_mapped_shared(folio) || + hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } + spin_unlock(ptl); return 0; } +#else +#define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ -void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) -{ -} +static const struct mm_walk_ops smaps_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, +}; -static int show_smap(struct seq_file *m, void *v, int is_pid) +static const struct mm_walk_ops smaps_shmem_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_RDLOCK, +}; + +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. + */ +static void smap_gather_stats(struct vm_area_struct *vma, + struct mem_size_stats *mss, unsigned long start) { - struct vm_area_struct *vma = v; - struct mem_size_stats mss; - struct mm_walk smaps_walk = { - .pmd_entry = smaps_pte_range, -#ifdef CONFIG_HUGETLB_PAGE - .hugetlb_entry = smaps_hugetlb_range, -#endif - .mm = vma->vm_mm, - .private = &mss, - }; + const struct mm_walk_ops *ops = &smaps_walk_ops; - memset(&mss, 0, sizeof mss); + /* Invalid start */ + if (start >= vma->vm_end) + return; -#ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -745,90 +1295,220 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) */ unsigned long shmem_swapped = shmem_swap_usage(vma); - if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || - !(vma->vm_flags & VM_WRITE)) { - mss.swap = shmem_swapped; + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { + mss->swap += shmem_swapped; } else { - mss.check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + ops = &smaps_shmem_walk_ops; } } -#endif - /* mmap_sem is held in m_start */ - walk_page_vma(vma, &smaps_walk); - - show_map_vma(m, vma, is_pid); - - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n" - "Locked: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss.resident >> 10, - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), - mss.shared_clean >> 10, - mss.shared_dirty >> 10, - mss.private_clean >> 10, - mss.private_dirty >> 10, - mss.referenced >> 10, - mss.anonymous >> 10, - mss.lazyfree >> 10, - mss.anonymous_thp >> 10, - mss.shmem_thp >> 10, - mss.shared_hugetlb >> 10, - mss.private_hugetlb >> 10, - mss.swap >> 10, - (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10, - (vma->vm_flags & VM_LOCKED) ? 
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); - - arch_show_smap(m, vma); - show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); - return 0; + /* mmap_lock is held in m_start */ + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } -static int show_pid_smap(struct seq_file *m, void *v) +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) + +/* Show the contents common for smaps and smaps_rollup */ +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, + bool rollup_mode) { - return show_smap(m, v, 1); + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); + if (rollup_mode) { + /* + * These are meaningful only for smaps_rollup, otherwise two of + * them are zero, and the other one is the same as Pss. + */ + SEQ_PUT_DEC(" kB\nPss_Anon: ", + mss->pss_anon >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_File: ", + mss->pss_file >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Shmem: ", + mss->pss_shmem >> PSS_SHIFT); + } + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); + seq_puts(m, " kB\n"); +} + +static int show_smap(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma = v; + struct mem_size_stats mss = {}; + + smap_gather_stats(vma, &mss, 0); + + show_map_vma(m, vma); + + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + + __show_smap(m, &mss, false); + + seq_printf(m, "THPeligible: %8u\n", + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); + + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); + show_smap_vma_flags(m, vma); + + return 0; } -static int show_tid_smap(struct seq_file *m, void *v) +static int show_smaps_rollup(struct seq_file *m, void *v) { - return show_smap(m, v, 0); + struct proc_maps_private *priv = m->private; + struct mem_size_stats mss = {}; + struct mm_struct *mm = priv->lock_ctx.mm; + struct vm_area_struct *vma; + unsigned long vma_start = 0, last_vma_end = 0; + int ret = 0; + VMA_ITERATOR(vmi, mm, 0); + + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return -ESRCH; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out_put_task; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_put_mm; + + hold_task_mempolicy(priv); + vma = vma_next(&vmi); + + if (unlikely(!vma)) + goto empty_set; + + vma_start = vma->vm_start; + do { + smap_gather_stats(vma, &mss, 0); + 
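+		/*
+		 * (Illustrative note, not part of this patch: last_vma_end
+		 * records how far the rollup walk has progressed, so the
+		 * scan can resume correctly if mmap_lock is dropped on
+		 * contention below.)
+		 */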
last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + vma_iter_invalidate(&vmi); + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. + * + * 3) No more VMAs can be found: + * + * vma_next(vmi) will return NULL. + * No more things to do, just break. + * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * vma_next(vmi) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. + */ + vma = vma_next(&vmi); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 and 2 above */ + if (vma->vm_start >= last_vma_end) { + smap_gather_stats(vma, &mss, 0); + last_vma_end = vma->vm_end; + continue; + } + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) { + smap_gather_stats(vma, &mss, last_vma_end); + last_vma_end = vma->vm_end; + } + } + } for_each_vma(vmi, vma); + +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + + __show_smap(m, &mss, true); + + release_task_mempolicy(priv); + mmap_read_unlock(mm); + +out_put_mm: + mmput(mm); +out_put_task: + put_task_struct(priv->task); + priv->task = NULL; + + return ret; } +#undef SEQ_PUT_DEC static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_smap -}; - -static const struct seq_operations proc_tid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_smap + .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) @@ -836,9 +1516,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } -static int tid_smaps_open(struct inode *inode, struct file *file) +static int smaps_rollup_open(struct inode *inode, struct file *file) { - return do_maps_open(inode, file, &proc_tid_smaps_op); + int ret; + struct proc_maps_private *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); + if (!priv) + return -ENOMEM; + + ret = single_open(file, show_smaps_rollup, priv); + if (ret) + goto out_free; + + priv->inode = inode; + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + ret = priv->lock_ctx.mm ? 
PTR_ERR(priv->lock_ctx.mm) : -ESRCH; + + single_release(inode, file); + goto out_free; + } + + return 0; + +out_free: + kfree(priv); + return ret; +} + +static int smaps_rollup_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); + + kfree(priv); + return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { @@ -848,11 +1564,11 @@ const struct file_operations proc_pid_smaps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_smaps_operations = { - .open = tid_smaps_open, +const struct file_operations proc_pid_smaps_rollup_operations = { + .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, - .release = proc_map_release, + .release = smaps_rollup_release, }; enum clear_refs_types { @@ -868,52 +1584,78 @@ struct clear_refs_private { enum clear_refs_types type; }; -#ifdef CONFIG_MEM_SOFT_DIRTY +static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + struct folio *folio; + + if (!pte_write(pte)) + return false; + if (!is_cow_mapping(vma->vm_flags)) + return false; + if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))) + return false; + folio = vm_normal_folio(vma, addr, pte); + if (!folio) + return false; + return folio_maybe_dma_pinned(folio); +} + static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + if (!pgtable_supports_soft_dirty()) + return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the - * Documentation/vm/soft-dirty.txt for full description + * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. 
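	 *
	 * (Illustrative note, not part of this patch: once cleared, the
	 * write-protected pte makes the next store fault and re-set the
	 * bit. A typical cycle is `echo 4 > /proc/<pid>/clear_refs`, let
	 * the task run, then test bit 55 (PM_SOFT_DIRTY) of each
	 * /proc/<pid>/pagemap entry to find pages written since.)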
*/ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); + + if (pte_none(ptent)) + return; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); - ptent = pte_wrprotect(ptent); + pte_t old_pte; + + if (pte_is_pinned(vma, addr, ptent)) + return; + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); - } else if (is_swap_pte(ptent)) { + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } -#else -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} -#endif -#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; - /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) - pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) - pmd = pmd_mkyoung(pmd); - - pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_soft_dirty(pmd); + if (!pgtable_supports_soft_dirty()) + return; - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(old)) + pmd = pmd_mkyoung(pmd); + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_soft_dirty(pmd); + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (pmd_is_migration_entry(pmd)) { + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, @@ -929,7 +1671,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -938,23 +1680,27 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } - page = pmd_page(*pmd); + if (!pmd_present(*pmd)) + goto out; + + folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } - if (pmd_trans_unstable(pmd)) - return 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -964,14 +1710,14 @@ out: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio) continue; /* Clear accessed and referenced bits. 
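	 * (Illustrative note, not part of this patch: dropping the
	 * young/referenced state means a later read of the smaps
	 * "Referenced:" line counts only pages touched since this write,
	 * which is how clear_refs supports working-set estimation.)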
*/ ptep_test_and_clear_young(vma, addr, pte); - test_and_clear_page_young(page); - ClearPageReferenced(page); + folio_test_clear_young(folio); + folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -1000,18 +1746,23 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, return 0; } +static const struct mm_walk_ops clear_refs_walk_ops = { + .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, + .walk_lock = PGWALK_WRLOCK, +}; + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1028,55 +1779,46 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + VMA_ITERATOR(vmi, mm, 0); + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; - struct mm_walk clear_refs_walk = { - .pmd_entry = clear_refs_pte_range, - .test_walk = clear_refs_test_walk, - .mm = mm, - .private = &cp, - }; + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } if (type == CLEAR_REFS_MM_HIWATER_RSS) { - if (down_write_killable(&mm->mmap_sem)) { - count = -EINTR; - goto out_mm; - } - /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - up_write(&mm->mmap_sem); - goto out_mm; + goto out_unlock; } - down_read(&mm->mmap_sem); if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - up_read(&mm->mmap_sem); - if (down_write_killable(&mm->mmap_sem)) { - count = -EINTR; - goto out_mm; - } - for (vma = mm->mmap; vma; vma = vma->vm_next) { - vma->vm_flags &= ~VM_SOFTDIRTY; - vma_set_page_prot(vma); - } - downgrade_write(&mm->mmap_sem); - break; + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(&range); + } + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + if (type == CLEAR_REFS_SOFT_DIRTY) { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); } - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); - if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); - flush_tlb_mm(mm); - up_read(&mm->mmap_sem); +out_unlock: + mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1108,6 +1850,8 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1119,8 +1863,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { 
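	/*
	 * (Illustrative note, not part of this patch: entries are staged
	 * in pm->buffer, and the walk is stopped once pm->len entries
	 * have accumulated, i.e. once the caller's read() window is
	 * full; pagemap_read() then copies the buffer out and resumes.)
	 */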
pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1128,8 +1871,15 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, return 0; } +static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) +{ + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + return folio_precise_page_mapcount(folio, page) == 1; + return !folio_maybe_mapped_shared(folio); +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; @@ -1147,7 +1897,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1159,7 +1909,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1173,6 +1923,10 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; + struct folio *folio; + + if (pte_none(pte)) + goto out; if (pte_present(pte)) { if (pm->show_pfn) @@ -1181,28 +1935,134 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { - swp_entry_t entry; + if (pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + } else { + softleaf_t entry; + if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; - entry = pte_to_swp_entry(pte); - frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + if (pte_swp_uffd_wp(pte)) + flags |= PM_UFFD_WP; + entry = softleaf_from_pte(pte); + if (pm->show_pfn) { + pgoff_t offset; + + /* + * For PFN swap offsets, keeping the offset field + * to be PFN only to be compatible with old smaps. 
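+			 *
+			 * (Illustrative note, not part of this patch: a
+			 * swapped entry packs the swap type into bits 0-4
+			 * and the offset into bits 5-54, so a consumer can
+			 * decode it as e.g. type = frame & 0x1f and
+			 * offset = frame >> MAX_SWAPFILES_SHIFT.)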
+ */ + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry); + else + offset = swp_offset(entry); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; - if (is_migration_entry(entry)) - page = migration_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); + if (softleaf_is_uffd_wp_marker(entry)) + flags |= PM_UFFD_WP; + if (softleaf_is_guard_marker(entry)) + flags |= PM_GUARD_REGION; + } + + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + if ((flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + flags |= PM_MMAP_EXCLUSIVE; } - if (page && !PageAnon(page)) - flags |= PM_FILE; - if (page && page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; +out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct vm_area_struct *vma, + struct pagemapread *pm) +{ + unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + struct folio *folio = NULL; + int err = 0; + + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + + if (pmd_none(pmd)) + goto populate_pagemap; + + if (pmd_present(pmd)) { + page = pmd_page(pmd); + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + idx; + } else if (thp_migration_supported()) { + const softleaf_t entry = softleaf_from_pmd(pmd); + unsigned long offset; + + if (pm->show_pfn) { + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry) + idx; + else + offset = swp_offset(entry) + idx; + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); + page = softleaf_to_page(entry); + } + + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + } + +populate_pagemap: + for (; addr != end; addr += PAGE_SIZE, idx++) { + u64 cur_flags = flags; + pagemap_entry_t pme; + + if (folio && (flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + cur_flags |= PM_MMAP_EXCLUSIVE; + + pme = make_pme(frame, cur_flags); + err = add_to_pagemap(&pme, pm); + if (err) + break; + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } + } + return err; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1215,57 +2075,26 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { - u64 flags = 0, frame = 0; - pmd_t pmd = *pmdp; - - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. 
- */ - if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); - - if (page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; - - flags |= PM_PRESENT; - if (pm->show_pfn) - frame = pmd_pfn(pmd) + - ((addr & ~PMD_MASK) >> PAGE_SHIFT); - } - - for (; addr != end; addr += PAGE_SIZE) { - pagemap_entry_t pme = make_pme(frame, flags); - - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; - if (pm->show_pfn && (flags & PM_PRESENT)) - frame++; - } + err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm); spin_unlock(ptl); return err; } - - if (pmd_trans_unstable(pmdp)) - return 0; -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return err; + } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); - err = add_to_pagemap(addr, &pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1285,44 +2114,62 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; + spinlock_t *ptl; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; - pte = huge_ptep_get(ptep); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - if (!PageAnon(page)) + if (!folio_test_anon(folio)) flags |= PM_FILE; - if (page_mapcount(page) == 1) + if (!folio_maybe_mapped_shared(folio) && + !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; + if (huge_pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); + } else if (pte_swp_uffd_wp_any(pte)) { + flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) - return err; + break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } + spin_unlock(ptl); cond_resched(); return err; } +#else +#define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +static const struct mm_walk_ops pagemap_ops = { + .pmd_entry = pagemap_pmd_range, + .pte_hole = pagemap_pte_hole, + .hugetlb_entry = pagemap_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, +}; + /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -1332,9 +2179,11 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped - * Bits 57-60 zero + * Bit 57 pte is uffd-wp write-protected + * Bit 58 pte is a guard region + * Bits 59-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1354,7 +2203,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; struct pagemapread pm; - struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned 
long start_vaddr; @@ -1377,34 +2225,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); + pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; - pagemap_walk.pmd_entry = pagemap_pmd_range; - pagemap_walk.pte_hole = pagemap_pte_hole; -#ifdef CONFIG_HUGETLB_PAGE - pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; -#endif - pagemap_walk.mm = mm; - pagemap_walk.private = ± - src = *ppos; svpfn = src / PM_ENTRY_BYTES; - start_vaddr = svpfn << PAGE_SHIFT; end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > mm->task_size >> PAGE_SHIFT) + start_vaddr = end_vaddr; + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + unsigned long end; + + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); + mmap_read_unlock(mm); + + end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); + if (end >= start_vaddr && end < mm->task_size) + end_vaddr = end; + } + + /* Ensure the address is inside the task */ + if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; @@ -1415,9 +2264,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; - down_read(&mm->mmap_sem); - ret = walk_page_range(start_vaddr, end, &pagemap_walk); - up_read(&mm->mmap_sem); + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); @@ -1446,8 +2297,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? 
PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } @@ -1461,11 +2312,794 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories; + + if (pte_none(pte)) + return 0; + + if (pte_present(pte)) { + struct page *page; + + categories = PAGE_IS_PRESENT; + + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + softleaf_t entry; + + categories = PAGE_IS_SWAPPED; + + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + entry = softleaf_from_pte(pte); + if (softleaf_is_guard_marker(entry)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) + categories |= PAGE_IS_FILE; + + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, pte_t ptent) +{ + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(old_pte); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (pte_none(ptent)) { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } else { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_none(pmd)) + return categories; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_huge_zero_pmd(pmd)) + categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; + + if (p->masks_of_interest & PAGE_IS_FILE) { + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if 
(pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (pmd_is_migration_entry(pmd)) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pte_none(pte)) + return categories; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } else { + categories |= PAGE_IS_SWAPPED; + + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + const unsigned long psize = huge_page_size(hstate_vma(vma)); + softleaf_t entry; + + if (huge_pte_none(ptent)) { + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); + return; + } + + entry = softleaf_from_pte(ptent); + if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) + return; + + if (softleaf_is_migration(entry)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (!p->vec_buf) + return; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + bool wp_allowed = userfaultfd_wp_async(vma) && + userfaultfd_wp_use_markers(vma); + + if (!wp_allowed) { + /* User requested explicit failure over wp-async capability */ + if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + /* + * User requires wr-protect, and allows silently skipping + * unsupported vmas. 
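+		 *
+		 * (Illustrative note, not part of this patch: returning 1
+		 * from a test_walk callback skips the whole vma without
+		 * failing the walk, so only PM_SCAN_CHECK_WPASYNC turns a
+		 * missing wp-async capability into a hard -EPERM.)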
+ */ + if (p->arg.flags & PM_SCAN_WP_MATCHING) + return 1; + /* + * Then the request doesn't involve wr-protects at all, + * fall through to the rest checks, and allow vma walk. + */ + } + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (wp_allowed) + vma_category |= PAGE_IS_WPALLOWED; + + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. 
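+	 *
+	 * (Illustrative note, not part of this patch: after
+	 * split_huge_pmd() the range is backed out of the output and
+	 * -ENOENT is returned; the caller treats that as "no THP here"
+	 * and re-walks the same range as individual ptes.)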
+ */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) + return ret; + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } + + arch_enter_lazy_mmu_mode(); + + if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { + /* Fast path for performing exclusive WP */ + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(pte); + + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) + continue; + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = addr + PAGE_SIZE; + } + goto flush_and_return; + } + + if (!p->arg.category_anyof_mask && !p->arg.category_inverted && + p->arg.category_mask == PAGE_IS_WRITTEN && + p->arg.return_mask == PAGE_IS_WRITTEN) { + for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { + unsigned long next = addr + PAGE_SIZE; + pte_t ptent = ptep_get(pte); + + if ((pte_present(ptent) && pte_uffd_wp(ptent)) || + pte_swp_uffd_wp_any(ptent)) + continue; + ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, + p, addr, &next); + if (next == addr) + break; + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = next; + } + goto flush_and_return; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(pte); + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptent); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte, ptent); + if (!flush_end) + start = addr; + flush_end = next; + } + +flush_and_return: + if (flush_end) + flush_tlb_range(vma, start, addr); + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. 
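+		 * (Illustrative note, not part of this patch: the pure
+		 * read can tolerate a racy snapshot of the hugetlb pte,
+		 * while the write-protect path below takes the i_mmap
+		 * lock and the hugetlb page-table lock, apparently to
+		 * keep shared PMDs stable while they are wr-protected.)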
*/ + + pte = huge_ptep_get(walk->mm, start, ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(walk->mm, start, ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + size_mul(arg->vec_len, sizeof(struct page_region)))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 
0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + struct mmu_notifier_range range; + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, walk_start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. 
*/ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ @@ -1490,28 +3124,34 @@ struct numa_maps_private { static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { - int count = page_mapcount(page); + struct folio *folio = page_folio(page); + int count; + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + count = folio_precise_page_mapcount(folio, page); + else + count = folio_average_page_mapcount(folio); md->pages += nr_pages; - if (pte_dirty || PageDirty(page)) + if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) md->swapcache += nr_pages; - if (PageActive(page) || PageUnevictable(page)) + if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) md->writeback += nr_pages; - if (PageAnon(page)) + if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)] += nr_pages; + md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, @@ -1524,7 +3164,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; page = vm_normal_page(vma, addr, pte); - if (!page) + if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) @@ -1584,16 +3224,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); return 0; } - - if (pmd_trans_unstable(pmd)) - return 0; #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -1604,19 +3246,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pte_t huge_pte = huge_ptep_get(pte); + pte_t huge_pte; struct numa_maps *md; struct page *page; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + huge_pte = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(huge_pte)) - return 0; + goto out; page = pte_page(huge_pte); - if (!page) - return 0; md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); +out: + spin_unlock(ptl); return 0; } @@ -1628,10 +3273,16 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long 
hmask, } #endif +static const struct mm_walk_ops show_numa_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_RDLOCK, +}; + /* * Display pages allocated per node and memory policy via /proc. */ -static int show_numa_map(struct seq_file *m, void *v, int is_pid) +static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; @@ -1639,14 +3290,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mm_walk walk = { - .hugetlb_entry = gather_hugetlb_stats, - .pmd_entry = gather_pte_stats, - .private = md, - .mm = mm, - }; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -1655,7 +3301,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); @@ -1667,18 +3313,18 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_path(m, file_user_path(file), "\n\t= "); + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - /* mmap_sem is held by m_start */ - walk_page_vma(vma, &walk); + /* mmap_lock is held by m_start */ + walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; @@ -1711,49 +3357,20 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); - m_cache_vma(m, vma); return 0; } -static int show_pid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 1); -} - -static int show_tid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 0); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_numa_map, + .show = show_numa_map, }; -static const struct seq_operations proc_tid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_numa_map, -}; - -static int numa_maps_open(struct inode *inode, struct file *file, - const struct seq_operations *ops) -{ - return proc_maps_open(inode, file, ops, - sizeof(struct numa_maps_private)); -} - static int pid_numa_maps_open(struct inode *inode, struct file *file) { - return numa_maps_open(inode, file, &proc_pid_numa_maps_op); -} - -static int tid_numa_maps_open(struct inode *inode, struct file *file) -{ - return numa_maps_open(inode, file, &proc_tid_numa_maps_op); + return proc_maps_open(inode, file, &proc_pid_numa_maps_op, + sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { @@ -1763,10 +3380,4 @@ const struct file_operations proc_pid_numa_maps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_numa_maps_operations = { - .open = tid_numa_maps_open, - .read = seq_read, - .llseek = 
seq_lseek, - .release = proc_map_release, -}; #endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 23266694db11..d362919f4f68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/file.h> @@ -19,15 +20,13 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - - down_read(&mm->mmap_sem); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { bytes += kobjsize(vma); region = vma->vm_region; @@ -39,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; @@ -52,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) sbytes += kobjsize(mm); else bytes += kobjsize(mm); - + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else @@ -63,34 +62,32 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(current->files); - if (current->sighand && atomic_read(¤t->sighand->count) > 1) + if (current->sighand && refcount_read(¤t->sighand->count) > 1) sbytes += kobjsize(current->sighand); else bytes += kobjsize(current->sighand); bytes += kobjsize(current); /* includes kernel stack */ + mmap_read_unlock(mm); + seq_printf(m, "Mem:\t%8lu bytes\n" "Slack:\t%8lu bytes\n" "Shared:\t%8lu bytes\n", bytes, slack, sbytes); - - up_read(&mm->mmap_sem); } unsigned long task_vsize(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - struct rb_node *p; unsigned long vsize = 0; - down_read(&mm->mmap_sem); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return vsize; } @@ -98,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long size = kobjsize(mm); - down_read(&mm->mmap_sem); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { size += kobjsize(vma); region = vma->vm_region; if (region) { @@ -118,35 +114,19 @@ unsigned long task_statm(struct mm_struct *mm, >> PAGE_SHIFT; *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) >> PAGE_SHIFT; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); size >>= PAGE_SHIFT; size += *text + *data; *resident = size; return size; } -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. 
- */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, - int is_pid) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; - struct proc_maps_private *priv = m->private; unsigned long ino = 0; struct file *file; dev_t dev = 0; @@ -177,10 +157,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma)) { + seq_path(m, file_user_path(file), ""); + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); - seq_printf(m, "[stack]"); + seq_puts(m, "[stack]"); } seq_putc(m, '\n'); @@ -190,85 +170,83 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_p, int is_pid) +static int show_map(struct seq_file *m, void *_p) { - struct rb_node *p = _p; - - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), - is_pid); + return nommu_vma_show(m, _p); } -static int show_pid_map(struct seq_file *m, void *_p) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) { - return show_map(m, _p, 1); -} + struct vm_area_struct *vma = vma_next(&priv->iter); -static int show_tid_map(struct seq_file *m, void *_p) -{ - return show_map(m, _p, 0); + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; } -static void *m_start(struct seq_file *m, loff_t *pos) +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct rb_node *p; - loff_t n = *pos; + + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) + return NULL; /* pin the task and mm whilst we play with them */ priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + mm = priv->lock_ctx.mm; + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } + + if (mmap_read_lock_killable(mm)) { + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; + return ERR_PTR(-EINTR); + } - down_read(&mm->mmap_sem); - /* start from the Nth VMA */ - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) - if (n-- == 0) - return p; + vma_iter_init(&priv->iter, mm, last_addr); - up_read(&mm->mmap_sem); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->lock_ctx.mm; - if (!IS_ERR_OR_NULL(_vml)) { - up_read(&priv->mm->mmap_sem); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct rb_node *p = _p; - - (*pos)++; - return p ? 
rb_next(p) : NULL; + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_map -}; - -static const struct seq_operations proc_tid_maps_ops = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map + .show = show_map }; static int maps_open(struct inode *inode, struct file *file, @@ -281,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -298,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } @@ -309,11 +287,6 @@ static int pid_maps_open(struct inode *inode, struct file *file) return maps_open(inode, file, &proc_pid_maps_ops); } -static int tid_maps_open(struct inode *inode, struct file *file) -{ - return maps_open(inode, file, &proc_tid_maps_ops); -} - const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, @@ -321,10 +294,3 @@ const struct file_operations proc_pid_maps_operations = { .release = map_release, }; -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = map_release, -}; - diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 20614b62a9b7..d6113dbe58e0 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -10,18 +12,17 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, - dentry ? GFP_KERNEL : GFP_ATOMIC); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? 
ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d/task/%d", tgid, pid); + sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } @@ -30,40 +31,35 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum; +unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; + int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { - struct inode *inode = new_inode_pseudo(s); + struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; - d_add(thread_self, inode); - } else { - dput(thread_self); - thread_self = ERR_PTR(-ENOMEM); + d_make_persistent(thread_self, inode); + ret = 0; } - } else { - thread_self = ERR_PTR(-ENOMEM); + dput(thread_self); } inode_unlock(root_inode); - if (IS_ERR(thread_self)) { - pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); - return PTR_ERR(thread_self); - } - ns->proc_thread_self = thread_self; - return 0; + + if (ret) + pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); + return ret; } void __init proc_thread_self_init(void) diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 7981c4ffe787..b5343d209381 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -1,25 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { - struct timespec uptime; - struct timespec idle; - u64 nsec; + struct timespec64 uptime; + struct timespec64 idle; + u64 idle_nsec; u32 rem; int i; - nsec = 0; - for_each_possible_cpu(i) - nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + idle_nsec = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcs; - get_monotonic_boottime(&uptime); - idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + kcpustat_cpu_fetch(&kcs, i); + idle_nsec += get_idle_time(&kcs, i); + } + + ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + + idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, @@ -29,21 +38,12 @@ static int uptime_proc_show(struct seq_file *m, void *v) return 0; } -static int uptime_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, uptime_proc_show, NULL); -} - -static const struct file_operations uptime_proc_fops = { - .open = uptime_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_uptime_init(void) { - proc_create("uptime", 0, NULL, &uptime_proc_fops); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 
000000000000..98f8adc17345 --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,24 @@ +#include <linux/dcache.h> +#include "internal.h" + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + do { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } while (--len > 0); + return n; +out: + return ~0U; +} diff --git a/fs/proc/version.c b/fs/proc/version.c index d2154eb6d78f..02e3c3cd4a9a 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -14,21 +16,12 @@ static int version_proc_show(struct seq_file *m, void *v) return 0; } -static int version_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, version_proc_show, NULL); -} - -static const struct file_operations version_proc_fops = { - .open = version_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_version_init(void) { - proc_create("version", 0, NULL, &version_proc_fops); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 885d445afa0d..f188bd900eb2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/proc/vmcore.c Interface for accessing the crash * dump from the system's previous life. @@ -7,6 +8,8 @@ * */ +#define pr_fmt(fmt) "vmcore: " fmt + #include <linux/mm.h> #include <linux/kcore.h> #include <linux/user.h> @@ -16,13 +19,16 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> -#include <linux/uaccess.h> +#include <linux/uio.h> +#include <linux/cc_platform.h> #include <asm/io.h> #include "internal.h" @@ -38,59 +44,125 @@ static size_t elfcorebuf_sz_orig; static char *elfnotes_buf; static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; /* Total size of vmcore file. */ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; -/* - * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error - * The called function has to take care of module refcounting. - */ -static int (*oldmem_pfn_is_ram)(unsigned long pfn); +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +struct vmcoredd_node { + struct list_head list; /* List of dumps */ + void *buf; /* Buffer containing device's dump */ + unsigned int size; /* Size of the buffer */ +}; + +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); + +static bool vmcoredd_disabled; +core_param(novmcoredd, vmcoredd_disabled, bool, 0); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + +static DEFINE_MUTEX(vmcore_mutex); + +DEFINE_STATIC_SRCU(vmcore_cb_srcu); +/* List of registered vmcore callbacks. 
*/ +static LIST_HEAD(vmcore_cb_list); +/* Whether the vmcore has been opened once. */ +static bool vmcore_opened; +/* Whether the vmcore is currently open. */ +static unsigned int vmcore_open; -int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +static void vmcore_process_device_ram(struct vmcore_cb *cb); + +void register_vmcore_cb(struct vmcore_cb *cb) { - if (oldmem_pfn_is_ram) - return -EBUSY; - oldmem_pfn_is_ram = fn; - return 0; + INIT_LIST_HEAD(&cb->next); + mutex_lock(&vmcore_mutex); + list_add_tail(&cb->next, &vmcore_cb_list); + /* + * Registering a vmcore callback after the vmcore was opened is + * very unusual (e.g., manual driver loading). + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback registration\n"); + if (!vmcore_open && cb->get_device_ram) + vmcore_process_device_ram(cb); + mutex_unlock(&vmcore_mutex); } -EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(register_vmcore_cb); -void unregister_oldmem_pfn_is_ram(void) +void unregister_vmcore_cb(struct vmcore_cb *cb) { - oldmem_pfn_is_ram = NULL; - wmb(); + mutex_lock(&vmcore_mutex); + list_del_rcu(&cb->next); + /* + * Unregistering a vmcore callback after the vmcore was opened is + * very unusual (e.g., forced driver removal), but we cannot stop + * unregistering. + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback unregistration\n"); + mutex_unlock(&vmcore_mutex); + + synchronize_srcu(&vmcore_cb_srcu); } -EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); -static int pfn_is_ram(unsigned long pfn) +static bool pfn_is_ram(unsigned long pfn) { - int (*fn)(unsigned long pfn); - /* pfn is ram unless fn() checks pagetype */ - int ret = 1; + struct vmcore_cb *cb; + bool ret = true; - /* - * Ask hypervisor if the pfn is really ram. - * A ballooned page contains no data and reading from such a page - * will cause high load in the hypervisor. - */ - fn = oldmem_pfn_is_ram; - if (fn) - ret = fn(pfn); + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { + if (unlikely(!cb->pfn_is_ram)) + continue; + ret = cb->pfn_is_ram(cb, pfn); + if (!ret) + break; + } return ret; } +static int open_vmcore(struct inode *inode, struct file *file) +{ + mutex_lock(&vmcore_mutex); + vmcore_opened = true; + if (vmcore_open + 1 == 0) { + mutex_unlock(&vmcore_mutex); + return -EBUSY; + } + vmcore_open++; + mutex_unlock(&vmcore_mutex); + + return 0; +} + +static int release_vmcore(struct inode *inode, struct file *file) +{ + mutex_lock(&vmcore_mutex); + vmcore_open--; + mutex_unlock(&vmcore_mutex); + + return 0; +} + /* Reads a page from the oldmem device from given offset. 
*/ -static ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf) +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { unsigned long pfn, offset; - size_t nr_bytes; + ssize_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -98,6 +170,7 @@ static ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -105,21 +178,29 @@ static ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) - memset(buf, 0, nr_bytes); - else { - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); - if (tmp < 0) - return tmp; + if (!pfn_is_ram(pfn)) { + tmp = iov_iter_zero(nr_bytes, iter); + } else { + if (encrypted) + tmp = copy_oldmem_page_encrypted(iter, pfn, + nr_bytes, + offset); + else + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); } + if (tmp < nr_bytes) { + srcu_read_unlock(&vmcore_cb_srcu, idx); + return -EFAULT; + } + *ppos += nr_bytes; count -= nr_bytes; - buf += nr_bytes; read += nr_bytes; ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); return read; } @@ -143,7 +224,12 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); } /* @@ -151,7 +237,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -161,118 +253,229 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + prot = pgprot_encrypted(prot); return remap_pfn_range(vma, from, pfn, size, prot); } /* - * Copy to either kernel or user space + * Architectures which support memory encryption override this. 
*/ -static int copy_to(void *target, void *src, size_t size, int userbuf) +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) { - if (userbuf) { - if (copy_to_user((char __user *) target, src, size)) - return -EFAULT; - } else { - memcpy(target, src, size); + return copy_oldmem_page(iter, pfn, csize, offset); +} + +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + size_t tsz; + char *buf; + + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to_iter(buf, tsz, iter) < tsz) + return -EFAULT; + + size -= tsz; + start += tsz; + + /* Leave now if buffer filled already */ + if (!size) + return 0; + } + offset += dump->size; } + return 0; } +#ifdef CONFIG_MMU +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + size_t tsz; + char *buf; + + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, 0, + tsz)) + return -EFAULT; + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + return 0; + } + offset += dump->size; + } + + return 0; +} +#endif /* CONFIG_MMU */ +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* Read from the ELF header and then the crash dump. On error, a negative value is * returned, otherwise the number of bytes read is returned. */ -static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, - int userbuf) +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { + struct vmcore_range *m = NULL; ssize_t acc = 0, tmp; size_t tsz; u64 start; - struct vmcore *m = NULL; - if (buflen == 0 || *fpos >= vmcore_size) + if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; - /* trim buflen to not go beyond EOF */ - if (buflen > vmcore_size - *fpos) - buflen = vmcore_size - *fpos; + iov_iter_truncate(iter, vmcore_size - *fpos); /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } - /* Read Elf note segment */ + /* Read ELF note segment */ if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; - tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); - kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; - if (copy_to(buffer, kaddr, tsz, userbuf)) + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensures that zero-filled data can be + * avoided. 
+ */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, iov_iter_count(iter)); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(iter, start, tsz)) + return -EFAULT; + + *fpos += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!iov_iter_count(iter)) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; + if (copy_to_iter(kaddr, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; + *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; + + cond_resched(); } list_for_each_entry(m, &vmcore_list, list) { if (*fpos < m->offset + m->size) { tsz = (size_t)min_t(unsigned long long, m->offset + m->size - *fpos, - buflen); + iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + tmp = read_from_oldmem(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } + + cond_resched(); } return acc; } -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) { - return __read_vmcore((__force char *) buffer, buflen, fpos, 1); + return __read_vmcore(iter, &iocb->ki_pos); } +/** + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @size: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *vmcore_alloc_buf(size_t size) +{ +#ifdef CONFIG_MMU + return vmalloc_user(size); +#else + return vzalloc(size); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU + /* * The vmcore fault handler uses the page cache and fills data using the - * standard __vmcore_read() function. + * standard __read_vmcore() function. * * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). 
*/ -static int mmap_vmcore_fault(struct vm_fault *vmf) +static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; struct page *page; loff_t offset; - char *buf; int rc; page = find_or_create_page(mapping, index, GFP_KERNEL); @@ -280,12 +483,15 @@ static int mmap_vmcore_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (!PageUptodate(page)) { offset = (loff_t) index << PAGE_SHIFT; - buf = __va((page_to_pfn(page) << PAGE_SHIFT)); - rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); if (rc < 0) { unlock_page(page); put_page(page); - return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return vmf_error(rc); } SetPageUptodate(page); } @@ -301,35 +507,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = { .fault = mmap_vmcore_fault, }; -/** - * alloc_elfnotes_buf - allocate buffer for ELF note segment in - * vmalloc memory - * - * @notes_sz: size of buffer - * - * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap - * the buffer to user-space by means of remap_vmalloc_range(). - * - * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is - * disabled and there's no need to allow users to mmap the buffer. - */ -static inline char *alloc_elfnotes_buf(size_t notes_sz) -{ -#ifdef CONFIG_MMU - return vmalloc_user(notes_sz); -#else - return vzalloc(notes_sz); -#endif -} - -/* - * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is - * essential for mmap_vmcore() in order to map physically - * non-contiguous objects (ELF header, ELF note segment and memory - * regions in the 1st kernel pointed to by PT_LOAD entries) into - * virtually contiguous user-space in ELF layout. - */ -#ifdef CONFIG_MMU /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. @@ -396,21 +573,26 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + int ret, idx; + /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. 
*/ - if (oldmem_pfn_is_ram) - return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + idx = srcu_read_lock(&vmcore_cb_srcu); + if (!list_empty(&vmcore_cb_list)) + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else - return remap_oldmem_pfn_range(vma, from, pfn, size, prot); + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); + srcu_read_unlock(&vmcore_cb_srcu, idx); + return ret; } static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; u64 start, end, len, tsz; - struct vmcore *m; + struct vmcore_range *m; start = (u64)vma->vm_pgoff << PAGE_SHIFT; end = start + size; @@ -421,8 +603,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; @@ -446,11 +627,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensures that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmapped at a page-aligned + * address. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); - kaddr = elfnotes_buf + start - elfcorebuf_sz; + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, - kaddr, tsz)) + kaddr, 0, tsz)) goto fail; + size -= tsz; start += tsz; len += tsz; @@ -491,22 +707,19 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) } #endif -static const struct file_operations proc_vmcore_operations = { - .read = read_vmcore, - .llseek = default_llseek, - .mmap = mmap_vmcore, +static const struct proc_ops vmcore_proc_ops = { + .proc_open = open_vmcore, + .proc_release = release_vmcore, + .proc_read_iter = read_vmcore, + .proc_lseek = default_llseek, + .proc_mmap = mmap_vmcore, }; -static struct vmcore* __init get_new_element(void) -{ - return kzalloc(sizeof(struct vmcore), GFP_KERNEL); -} - -static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, - struct list_head *vc_list) +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { + struct vmcore_range *m; u64 size; - struct vmcore *m; size = elfsz + elfnotesegsz; list_for_each_entry(m, vc_list, list) { @@ -665,7 +878,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - 
*notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -681,7 +894,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf64_Ehdr); @@ -698,6 +911,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when device dumps are added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -851,7 +1069,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -867,7 +1085,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf32_Ehdr); @@ -884,6 +1102,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when device dumps are added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -898,12 +1121,11 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, Elf64_Ehdr *ehdr_ptr; Elf64_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -917,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset. */ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -941,12 +1158,11 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, Elf32_Ehdr *ehdr_ptr; Elf32_Phdr *phdr_ptr; loff_t vmcore_off; - struct vmcore *new; ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. 
*/ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -960,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); size = end - start; - /* Add this contiguous chunk of memory to vmcore list.*/ - new = get_new_element(); - if (!new) + if (vmcore_alloc_add_range(vc_list, start, size)) return -ENOMEM; - new->paddr = start; - new->size = size; - list_add_tail(&new->list, vc_list); /* Update the program header offset */ phdr_ptr->p_offset = vmcore_off + (paddr - start); @@ -976,13 +1187,13 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, - struct list_head *vc_list) +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { + struct vmcore_range *m; loff_t vmcore_off; - struct vmcore *m; - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { @@ -1007,7 +1218,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -1063,7 +1274,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -1145,6 +1356,359 @@ static int __init parse_crash_elf_headers(void) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. + */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME); + strscpy_pad(vdd_hdr->dump_name, data->dump_name); +} + +/** + * vmcoredd_update_program_headers - Update all ELF program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of ELF header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. 
+ */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * ELF header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the ELF program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write ELF note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. 
+ */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (vmcoredd_disabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) + return -ENOMEM; + + /* Keep size of the buffer page aligned so that it can be mmapped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for drivers to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routine */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list and update the elfcore hdr */ + scoped_guard(mutex, &vmcore_mutex) { + if (vmcore_opened) + pr_warn_once("Unexpected adding of device dump\n"); + if (vmcore_open) { + ret = -EBUSY; + goto out_err; + } + + list_add_tail(&dump->list, &vmcoredd_list); + vmcoredd_update_size(data_size); + } + return 0; + +out_err: + vfree(buf); + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size) +{ + char *elfcorebuf_new; + + if (WARN_ON_ONCE(new_size < elfcorebuf_sz)) + return -EINVAL; + if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) { + elfcorebuf_sz_orig = new_size; + return 0; + } + + elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(new_size)); + if (!elfcorebuf_new) + return -ENOMEM; + memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz); + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = elfcorebuf_new; + elfcorebuf_sz_orig = new_size; + return 0; +} + +static void vmcore_reset_offsets_elf64(void) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz; + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + Elf64_Phdr *phdr; + int i; + + for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) { + u64 start, end; + + /* + * After merge_note_headers_elf64() we should only have a single + * PT_NOTE entry that starts immediately after elfcorebuf_sz. + */ + if (phdr->p_type == PT_NOTE) { + phdr->p_offset = elfcorebuf_sz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE); + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off = vmcore_off + end - start; + } + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); +} + +static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count) +{ + Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr)); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf; + struct vmcore_range *cur; + Elf64_Phdr *phdr; + size_t new_size; + int rc; + + if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) { + pr_err("too many device ram ranges\n"); + return -ENOSPC; + } + + /* elfcorebuf_sz must always cover full pages. 
*/ + new_size = sizeof(Elf64_Ehdr) + + (ehdr->e_phnum + count) * sizeof(Elf64_Phdr); + new_size = roundup(new_size, PAGE_SIZE); + + /* + * Make sure we have sufficient space to include the new PT_LOAD + * entries. + */ + rc = vmcore_realloc_elfcore_buffer_elf64(new_size); + if (rc) { + pr_err("resizing elfcore failed\n"); + return rc; + } + + /* Modify our used elfcore buffer size to cover the new entries. */ + elfcorebuf_sz = new_size; + + /* Fill the added PT_LOAD entries. */ + phdr = phdr_start + ehdr->e_phnum; + list_for_each_entry(cur, list, list) { + WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE)); + elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size); + + /* p_offset will be adjusted later. */ + phdr++; + ehdr->e_phnum++; + } + list_splice_tail(list, &vmcore_list); + + /* We changed elfcorebuf_sz and added new entries; reset all offsets. */ + vmcore_reset_offsets_elf64(); + + /* Finally, recalculate the total vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; + return 0; +} + +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ + unsigned char *e_ident = (unsigned char *)elfcorebuf; + struct vmcore_range *first, *m; + LIST_HEAD(list); + int count; + + /* We only support Elf64 dumps for now. */ + if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) { + pr_err("device ram ranges only support Elf64\n"); + return; + } + + if (cb->get_device_ram(cb, &list)) { + pr_err("obtaining device ram ranges failed\n"); + return; + } + count = list_count_nodes(&list); + if (!count) + return; + + /* + * For some reason these ranges are already known? Might happen + * with unusual register->unregister->register sequences; we'll simply + * sanity check using the first range. + */ + first = list_first_entry(&list, struct vmcore_range, list); + list_for_each_entry(m, &vmcore_list, list) { + unsigned long long m_end = m->paddr + m->size; + unsigned long long first_end = first->paddr + first->size; + + if (first->paddr < m_end && m->paddr < first_end) + goto out_free; + } + + /* If adding the mem nodes succeeds, they must not be freed. */ + if (!vmcore_add_device_ram_elf64(&list, count)) + return; +out_free: + vmcore_free_ranges(&list); +} +#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */ +static void vmcore_process_device_ram(struct vmcore_cb *cb) +{ +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcore_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcore_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + /* Init function for vmcore module. 
*/ static int __init vmcore_init(void) { @@ -1162,13 +1726,14 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { - pr_warn("Kdump: vmcore not initialized\n"); + elfcorehdr_free(elfcorehdr_addr); + pr_warn("not initialized\n"); return rc; } elfcorehdr_free(elfcorehdr_addr); elfcorehdr_addr = ELFCORE_ADDR_ERR; - proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; @@ -1178,20 +1743,14 @@ fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) { - struct list_head *pos, *next; - if (proc_vmcore) { proc_remove(proc_vmcore); proc_vmcore = NULL; } - /* clear the vmcore list. */ - list_for_each_safe(pos, next, &vmcore_list) { - struct vmcore *m; - - m = list_entry(pos, struct vmcore, list); - list_del(&m->list); - kfree(m); - } + vmcore_free_ranges(&vmcore_list); free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); }
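
The PAGEMAP_SCAN path in the task_mmu.c hunks above (pagemap_scan_get_args() through do_pagemap_cmd()) is driven from userspace through an ioctl on /proc/<pid>/pagemap: the caller describes a page-aligned range and a category filter in struct pm_scan_arg, and the kernel fills a vector of struct page_region entries with merged ranges whose page categories match. A minimal userspace sketch follows, assuming the struct pm_scan_arg / struct page_region UAPI exported through <linux/fs.h> alongside this interface; the test buffer and the abbreviated error handling are illustrative only:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* struct pm_scan_arg, PAGEMAP_SCAN, PAGE_IS_* */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	struct page_region regions[32];
	struct pm_scan_arg arg = { 0 };
	void *mem;
	int fd, n, i;

	/* Page-aligned test range; touch every other page so the scan
	 * sees both present and non-present pages. */
	if (posix_memalign(&mem, psz, 16 * psz))
		return 1;
	for (i = 0; i < 16; i += 2)
		((char *)mem)[i * psz] = 1;

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	arg.size = sizeof(arg);			/* versioned; must match exactly */
	arg.start = (uintptr_t)mem;		/* must be page aligned */
	arg.end = arg.start + 16 * psz;
	arg.vec = (uintptr_t)regions;
	arg.vec_len = 32;
	arg.category_mask = PAGE_IS_PRESENT;	/* require this category bit */
	arg.return_mask = PAGE_IS_PRESENT;	/* report this bit per region */

	/* Returns the number of page_region entries written, < 0 on error;
	 * arg.walk_end is written back with how far the walk got. */
	n = ioctl(fd, PAGEMAP_SCAN, &arg);
	for (i = 0; i < n; i++)
		printf("%llx-%llx categories %llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end,
		       (unsigned long long)regions[i].categories);
	close(fd);
	free(mem);
	return n < 0;
}

Because do_pagemap_scan() stops early and writes walk_end back once the output vector or the max_pages budget fills up, a caller can resume a partial scan by restarting at arg.walk_end; with PM_SCAN_WP_MATCHING set, the same call also write-protects the matched pages, which is what the uffd_wp_range() and make_uffd_wp_huge_pte() paths above implement.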
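On the vmcore side, vmcore_add_device_dump() above is the entry point a device driver calls from the crash-recovery kernel to append a hardware/firmware snapshot to /proc/vmcore as an NT_VMCOREDD note. A hedged driver-side sketch; the foo_* names, the snapshot size, and the fill logic are hypothetical, and only struct vmcoredd_data and vmcore_add_device_dump() are taken from <linux/crash_dump.h>:

#include <linux/crash_dump.h>
#include <linux/printk.h>
#include <linux/sizes.h>
#include <linux/string.h>

#define FOO_FW_DUMP_SIZE	SZ_64K	/* hypothetical snapshot size */

/* Called with a zeroed, vmalloc'ed buffer of data->size bytes that sits
 * right behind the NT_VMCOREDD note header written by vmcore. */
static int foo_vmcoredd_collect(struct vmcoredd_data *data, void *buf)
{
	/* A real driver would read device registers or firmware state
	 * into buf here; zero-filling is a placeholder. */
	memset(buf, 0, data->size);
	return 0;
}

static void foo_collect_crash_dump(void)
{
	struct vmcoredd_data data = {};

	strscpy(data.dump_name, "foo_fw", sizeof(data.dump_name));
	data.size = FOO_FW_DUMP_SIZE;
	data.vmcoredd_callback = foo_vmcoredd_collect;

	/* Appends an NT_VMCOREDD note to /proc/vmcore; fails with -EBUSY
	 * once vmcore is held open and -EINVAL under novmcoredd. */
	if (vmcore_add_device_dump(&data))
		pr_warn("foo: failed to add device dump\n");
}

The callback runs synchronously inside vmcore_add_device_dump(), which then takes vmcore_mutex to splice the dump into vmcoredd_list and regrow the ELF note segment via vmcoredd_update_size(); since the add is refused once /proc/vmcore is open, a natural call site is the driver's probe path in the kdump kernel.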
