summaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig62
-rw-r--r--fs/proc/Makefile5
-rw-r--r--fs/proc/array.c386
-rw-r--r--fs/proc/base.c1707
-rw-r--r--fs/proc/bootconfig.c102
-rw-r--r--fs/proc/cmdline.c23
-rw-r--r--fs/proc/consoles.c48
-rw-r--r--fs/proc/cpuinfo.c16
-rw-r--r--fs/proc/devices.c28
-rw-r--r--fs/proc/fd.c282
-rw-r--r--fs/proc/fd.h4
-rw-r--r--fs/proc/generic.c447
-rw-r--r--fs/proc/inode.c565
-rw-r--r--fs/proc/internal.h283
-rw-r--r--fs/proc/interrupts.c19
-rw-r--r--fs/proc/kcore.c745
-rw-r--r--fs/proc/kmsg.c23
-rw-r--r--fs/proc/loadavg.c26
-rw-r--r--fs/proc/meminfo.c110
-rw-r--r--fs/proc/namespaces.c58
-rw-r--r--fs/proc/nommu.c24
-rw-r--r--fs/proc/page.c337
-rw-r--r--fs/proc/proc_net.c259
-rw-r--r--fs/proc/proc_sysctl.c1058
-rw-r--r--fs/proc/proc_tty.c28
-rw-r--r--fs/proc/root.c442
-rw-r--r--fs/proc/self.c38
-rw-r--r--fs/proc/softirqs.c21
-rw-r--r--fs/proc/stat.c140
-rw-r--r--fs/proc/task_mmu.c2817
-rw-r--r--fs/proc/task_nommu.c174
-rw-r--r--fs/proc/thread_self.c36
-rw-r--r--fs/proc/uptime.c42
-rw-r--r--fs/proc/util.c24
-rw-r--r--fs/proc/version.c19
-rw-r--r--fs/proc/vmcore.c901
36 files changed, 7552 insertions, 3747 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..6ae966c561e7 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config PROC_FS
bool "/proc file system support" if EXPERT
default y
@@ -22,7 +23,7 @@ config PROC_FS
/proc" or the equivalent line in /etc/fstab does the job.
The /proc file system is explained in the file
- <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+ <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage
("man 5 proc").
This option will enlarge your kernel by about 67 KB. Several
@@ -31,6 +32,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+ select VMCORE_INFO
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
@@ -40,22 +42,57 @@ config PROC_VMCORE
bool "/proc/vmcore support"
depends on PROC_FS && CRASH_DUMP
default y
- help
- Exports the dump image of crashed kernel in ELF format.
+ help
+ Exports the dump image of crashed kernel in ELF format.
+
+config PROC_VMCORE_DEVICE_DUMP
+ bool "Device Hardware/Firmware Log Collection"
+ depends on PROC_VMCORE
+ default n
+ help
+ After kernel panic, device drivers can collect the device
+ specific snapshot of their hardware or firmware before the
+ underlying devices are initialized in crash recovery kernel.
+ Note that the device driver must be present in the crash
+ recovery kernel's initramfs to collect its underlying device
+ snapshot.
+
+ If you say Y here, the collected device dumps will be added
+ as ELF notes to /proc/vmcore. You can still disable device
+ dump using the kernel command line option 'novmcoredd'.
+
+config NEED_PROC_VMCORE_DEVICE_RAM
+ bool
+
+config PROC_VMCORE_DEVICE_RAM
+ def_bool y
+ depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM
+ depends on VIRTIO_MEM
+ help
+ If the elfcore hdr is allocated and prepared by the dump kernel
+ ("2nd kernel") instead of the crashed kernel, RAM provided by memory
+ devices such as virtio-mem will not be included in the dump
+ image, because only the device driver can properly detect them.
+
+ With this config enabled, these RAM ranges will be queried from the
+ device drivers once the device gets probed, so they can be included
+ in the crash dump.
+
+ Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM.
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
select SYSCTL
default y
- ---help---
+ help
The sysctl interface provides a means of dynamically changing
certain kernel parameters and variables on the fly without requiring
a recompile of the kernel or reboot of the system. The primary
interface is through /proc/sys. If you say Y here a tree of
modifiable sysctl entries will be generated beneath the
- /proc/sys directory. They are explained in the files
- in <file:Documentation/sysctl/>. Note that enabling this
+ /proc/sys directory. They are explained in the files
+ in <file:Documentation/admin-guide/sysctl/>. Note that enabling this
option will enlarge the kernel by at least 8 KB.
As it is generally a good thing, you should say Y here unless
@@ -70,14 +107,23 @@ config PROC_PAGE_MONITOR
Various /proc files exist to monitor process memory utilization:
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
- interfaces will reduce the size of the kernel by approximately 4kb.
+ interfaces will reduce the size of the kernel by approximately 4kb.
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
default n
help
Provides a fast way to retrieve first level children pids of a task. See
- <file:Documentation/filesystems/proc.txt> for more information.
+ <file:Documentation/filesystems/proc.rst> for more information.
Say Y if you are running any user-space software which takes benefit from
this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+ def_bool n
+ depends on PROC_FS
+
+config PROC_CPU_RESCTRL
+ def_bool n
+ depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 12c6922c913c..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux proc filesystem routines.
#
obj-y += proc.o
-CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
@@ -20,6 +21,7 @@ proc-y += loadavg.o
proc-y += meminfo.o
proc-y += stat.o
proc-y += uptime.o
+proc-y += util.o
proc-y += version.o
proc-y += softirqs.o
proc-y += namespaces.o
@@ -31,3 +33,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c355574aa0..42932f88141a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/array.c
*
@@ -55,6 +56,7 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/tty.h>
@@ -62,11 +64,11 @@
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task_stack.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
-#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -83,32 +85,36 @@
#include <linux/delayacct.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
+#include <linux/prctl.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include "internal.h"
-static inline void task_name(struct seq_file *m, struct task_struct *p)
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
- char *buf;
- size_t size;
- char tcomm[sizeof(p->comm)];
- int ret;
+ char tcomm[64];
- get_task_comm(tcomm, p);
-
- seq_puts(m, "Name:\t");
-
- size = seq_get_buf(m, &buf);
- ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- seq_commit(m, ret < size ? ret : -1);
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else if (p->flags & PF_KTHREAD)
+ get_kthread_comm(tcomm, sizeof(tcomm), p);
+ else
+ get_task_comm(tcomm, p);
- seq_putc(m, '\n');
+ if (escape)
+ seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ else
+ seq_printf(m, "%.64s", tcomm);
}
/*
@@ -118,43 +124,25 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
* simple bit tests.
*/
static const char * const task_state_array[] = {
- "R (running)", /* 0 */
- "S (sleeping)", /* 1 */
- "D (disk sleep)", /* 2 */
- "T (stopped)", /* 4 */
- "t (tracing stop)", /* 8 */
- "X (dead)", /* 16 */
- "Z (zombie)", /* 32 */
+
+ /* states in TASK_REPORT: */
+ "R (running)", /* 0x00 */
+ "S (sleeping)", /* 0x01 */
+ "D (disk sleep)", /* 0x02 */
+ "T (stopped)", /* 0x04 */
+ "t (tracing stop)", /* 0x08 */
+ "X (dead)", /* 0x10 */
+ "Z (zombie)", /* 0x20 */
+ "P (parked)", /* 0x40 */
+
+ /* states beyond TASK_REPORT: */
+ "I (idle)", /* 0x80 */
};
static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
-
- /*
- * Parked tasks do not run; they sit in __kthread_parkme().
- * Without this check, we would report them as running, which is
- * clearly wrong, so we report them as sleeping instead.
- */
- if (tsk->state == TASK_PARKED)
- state = TASK_INTERRUPTIBLE;
-
- BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
-
- return task_state_array[fls(state)];
-}
-
-static inline int get_task_umask(struct task_struct *tsk)
-{
- struct fs_struct *fs;
- int umask = -ENOENT;
-
- task_lock(tsk);
- fs = tsk->fs;
- if (fs)
- umask = fs->umask;
- task_unlock(tsk);
- return umask;
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+ return task_state_array[task_state_index(tsk)];
}
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -162,35 +150,34 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
- int g, umask;
+ int g, umask = -1;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
- umask = get_task_umask(p);
- if (umask >= 0)
- seq_printf(m, "Umask:\t%#04o\n", umask);
-
task_lock(p);
+ if (p->fs)
+ umask = p->fs->umask;
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
task_unlock(p);
rcu_read_unlock();
- seq_printf(m, "State:\t%s", get_task_state(p));
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+ seq_puts(m, "State:\t");
+ seq_puts(m, get_task_state(p));
seq_put_decimal_ull(m, "\nTgid:\t", tgid);
seq_put_decimal_ull(m, "\nNgid:\t", ngid);
@@ -231,6 +218,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
seq_putc(m, '\n');
+
+ seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
}
void render_sigset_t(struct seq_file *m, const char *header,
@@ -255,8 +244,8 @@ void render_sigset_t(struct seq_file *m, const char *header,
seq_putc(m, '\n');
}
-static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
- sigset_t *catch)
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
+ sigset_t *sigcatch)
{
struct k_sigaction *k;
int i;
@@ -264,9 +253,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
k = p->sighand->action;
for (i = 1; i <= _NSIG; ++i, ++k) {
if (k->sa.sa_handler == SIG_IGN)
- sigaddset(ign, i);
+ sigaddset(sigign, i);
else if (k->sa.sa_handler != SIG_DFL)
- sigaddset(catch, i);
+ sigaddset(sigcatch, i);
}
}
@@ -275,7 +264,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
unsigned long flags;
sigset_t pending, shpending, blocked, ignored, caught;
int num_threads = 0;
- unsigned long qsize = 0;
+ unsigned int qsize = 0;
unsigned long qlim = 0;
sigemptyset(&pending);
@@ -291,7 +280,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = atomic_read(&__task_cred(p)->user->sigpending);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
@@ -312,13 +301,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
static void render_cap_t(struct seq_file *m, const char *header,
kernel_cap_t *a)
{
- unsigned __capi;
-
seq_puts(m, header);
- CAP_FOR_EACH_U32(__capi) {
- seq_printf(m, "%08x",
- a->cap[CAP_LAST_U32 - __capi]);
- }
+ seq_put_hex_ll(m, NULL, a->val, 16);
seq_putc(m, '\n');
}
@@ -349,7 +333,63 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
+ seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+ atomic_read(&p->seccomp.filter_count));
+#endif
#endif
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+ case -EINVAL:
+ seq_puts(m, "unknown");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not vulnerable");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "thread force mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "thread mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "thread vulnerable");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "globally mitigated");
+ break;
+ default:
+ seq_puts(m, "vulnerable");
+ break;
+ }
+
+ seq_puts(m, "\nSpeculationIndirectBranch:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) {
+ case -EINVAL:
+ seq_puts(m, "unsupported");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not affected");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "conditional force disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "conditional disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "conditional enabled");
+ break;
+ case PR_SPEC_ENABLE:
+ seq_puts(m, "always enabled");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "always disabled");
+ break;
+ default:
+ seq_puts(m, "unknown");
+ break;
+ }
seq_putc(m, '\n');
}
@@ -364,9 +404,34 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
- cpumask_pr_args(&task->cpus_allowed));
+ cpumask_pr_args(&task->cpus_mask));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
- cpumask_pr_args(&task->cpus_allowed));
+ cpumask_pr_args(&task->cpus_mask));
+}
+
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
+{
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
+ seq_putc(m, '\n');
+}
+
+static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
+{
+ bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
+
+ if (thp_enabled)
+ thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
+ seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
+}
+
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+ struct task_struct *task)
+{
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -374,11 +439,17 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
{
struct mm_struct *mm = get_task_mm(task);
- task_name(m, task);
+ seq_puts(m, "Name:\t");
+ proc_task_name(m, task, true);
+ seq_putc(m, '\n');
+
task_state(m, ns, pid, task);
if (mm) {
task_mem(m, mm);
+ task_core_dumping(m, task);
+ task_thp_status(m, mm);
+ task_untag_mask(m, mm);
mmput(mm);
}
task_sig(m, task);
@@ -387,6 +458,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
+ arch_proc_pid_thread_features(m, task);
return 0;
}
@@ -403,13 +475,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
- char tcomm[sizeof(task->comm)];
unsigned long flags;
+ int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -421,19 +492,24 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* esp and eip are intentionally zeroed out. There is no
* non-racy way to read them without freezing the task.
* Programs that need reliable values can use ptrace(2).
+ *
+ * The only exception is if the task is core dumping because
+ * a program is not able to use ptrace(2) in that case. It is
+ * safe because the task has stopped executing permanently.
*/
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
+ if (try_get_task_stack(task)) {
+ eip = KSTK_EIP(task);
+ esp = KSTK_ESP(task);
+ put_task_stack(task);
+ }
+ }
}
- get_task_comm(tcomm, task);
-
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
- cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -444,26 +520,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
- rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+ rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- } while_each_thread(task, t);
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
- gtime += sig->gtime;
+ if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+ exit_code = sig->group_exit_code;
}
sid = task_session_nr_ns(task, ns);
@@ -474,11 +535,38 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
if (permitted && (!whole || num_threads < 2))
- wchan = get_wchan(task);
- if (!whole) {
+ wchan = !task_is_running(task);
+
+ scoped_guard(rcu) {
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ }
+ }
+ }
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
@@ -487,10 +575,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
priority = task_prio(task);
nice = task_nice(task);
- /* convert nsec -> ticks */
- start_time = nsec_to_clock_t(task->real_start_time);
+ /* apply timens offset for boottime and convert nsec -> ticks */
+ start_time =
+ nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
- seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+ seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+ seq_puts(m, " (");
+ proc_task_name(m, task, false);
+ seq_puts(m, ") ");
+ seq_putc(m, state);
seq_put_decimal_ll(m, " ", ppid);
seq_put_decimal_ll(m, " ", pgid);
seq_put_decimal_ll(m, " ", sid);
@@ -534,10 +627,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
+ seq_put_decimal_ull(m, " ", wchan);
seq_put_decimal_ull(m, " ", 0);
seq_put_decimal_ull(m, " ", 0);
@@ -561,7 +651,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, " ", task->exit_code);
+ seq_put_decimal_ll(m, " ", exit_code);
else
seq_puts(m, " 0");
@@ -586,28 +676,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
+ unsigned long size;
+ unsigned long resident = 0;
+ unsigned long shared = 0;
+ unsigned long text = 0;
+ unsigned long data = 0;
+
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
- }
- /*
- * For quick read, open code by putting numbers directly
- * expected format is
- * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- * size, resident, shared, text, data);
- */
- seq_put_decimal_ull(m, "", size);
- seq_put_decimal_ull(m, " ", resident);
- seq_put_decimal_ull(m, " ", shared);
- seq_put_decimal_ull(m, " ", text);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", data);
- seq_put_decimal_ull(m, " ", 0);
- seq_putc(m, '\n');
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_putc(m, '\n');
+ } else {
+ seq_write(m, "0 0 0 0 0 0 0\n", 14);
+ }
return 0;
}
@@ -670,25 +767,22 @@ out:
static int children_seq_show(struct seq_file *seq, void *v)
{
- struct inode *inode = seq->private;
- pid_t pid;
-
- pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
- seq_printf(seq, "%d ", pid);
+ struct inode *inode = file_inode(seq->file);
+ seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb)));
return 0;
}
static void *children_seq_start(struct seq_file *seq, loff_t *pos)
{
- return get_children_pid(seq->private, NULL, *pos);
+ return get_children_pid(file_inode(seq->file), NULL, *pos);
}
static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct pid *pid;
- pid = get_children_pid(seq->private, v, *pos + 1);
+ pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
put_pid(v);
++*pos;
@@ -709,29 +803,13 @@ static const struct seq_operations children_seq_ops = {
static int children_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
-
- ret = seq_open(file, &children_seq_ops);
- if (ret)
- return ret;
-
- m = file->private_data;
- m->private = inode;
-
- return ret;
-}
-
-int children_seq_release(struct inode *inode, struct file *file)
-{
- seq_release(inode, file);
- return 0;
+ return seq_open(file, &children_seq_ops);
}
const struct file_operations proc_tid_children_operations = {
.open = children_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = children_seq_release,
+ .release = seq_release,
};
#endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea1..4eec684baca9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/base.c
*
@@ -57,7 +58,7 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
@@ -72,8 +73,8 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/printk.h>
+#include <linux/cache.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
@@ -83,6 +84,7 @@
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
@@ -90,15 +92,18 @@
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
-#include <linux/flex_array.h>
#include <linux/posix-timers.h>
-#ifdef CONFIG_HARDWALL
-#include <asm/hardwall.h>
-#endif
+#include <linux/time_namespace.h>
+#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
+#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
+#include "../../lib/kstrtox.h"
+
/* NOTE:
* Implementing inode permission operations in /proc is almost
* certainly an error. Permission checks need to happen during
@@ -109,8 +114,42 @@
* in /proc for a task before it execs a suid executable.
*/
-static u8 nlink_tid;
-static u8 nlink_tgid;
+static u8 nlink_tid __ro_after_init;
+static u8 nlink_tgid __ro_after_init;
+
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preseve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
struct pid_entry {
const char *name;
@@ -139,9 +178,13 @@ struct pid_entry {
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show) \
- NOD(NAME, (S_IFREG|(MODE)), \
+ NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+#define ATTR(LSMID, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsmid = LSMID })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -204,171 +247,165 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
- size_t _count, loff_t *pos)
+/*
+ * If the user used setproctitle(), we just get the string from
+ * user space at arg_start, and limit it to a maximum of one page.
+ */
+static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
+ size_t count, unsigned long pos,
+ unsigned long arg_start)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
char *page;
- unsigned long count = _count;
- unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long len1, len2, len;
- unsigned long p;
- char c;
- ssize_t rv;
-
- BUG_ON(*pos < 0);
+ int ret, got;
- tsk = get_proc_task(file_inode(file));
- if (!tsk)
- return -ESRCH;
- mm = get_task_mm(tsk);
- put_task_struct(tsk);
- if (!mm)
+ if (pos >= PAGE_SIZE)
return 0;
- /* Check if process spawned far enough to have cmdline. */
- if (!mm->env_end) {
- rv = 0;
- goto out_mmput;
- }
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page) {
- rv = -ENOMEM;
- goto out_mmput;
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ ret = 0;
+ got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
+ if (got > 0) {
+ int len = strnlen(page, got);
+
+ /* Include the NUL character if it was found */
+ if (len < got)
+ len++;
+
+ if (len > pos) {
+ len -= pos;
+ if (len > count)
+ len = count;
+ len -= copy_to_user(buf, page+pos, len);
+ if (!len)
+ len = -EFAULT;
+ ret = len;
+ }
}
+ free_page((unsigned long)page);
+ return ret;
+}
+
+static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long pos, len;
+ char *page, c;
- down_read(&mm->mmap_sem);
+ /* Check if process spawned far enough to have cmdline. */
+ if (!mm->env_end)
+ return 0;
+
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
+
+ if (arg_start >= arg_end)
+ return 0;
- BUG_ON(arg_start > arg_end);
- BUG_ON(env_start > env_end);
+ /*
+ * We allow setproctitle() to overwrite the argument
+ * strings, and overflow past the original end. But
+ * only when it overflows into the environment area.
+ */
+ if (env_start != arg_end || env_end < env_start)
+ env_start = env_end = arg_end;
+ len = env_end - arg_start;
- len1 = arg_end - arg_start;
- len2 = env_end - env_start;
+ /* We're not going to care if "*ppos" has high bits set */
+ pos = *ppos;
+ if (pos >= len)
+ return 0;
+ if (count > len - pos)
+ count = len - pos;
+ if (!count)
+ return 0;
- /* Empty ARGV. */
- if (len1 == 0) {
- rv = 0;
- goto out_free_page;
- }
/*
- * Inherently racy -- command line shares address space
- * with code and data.
+ * Magical special case: if the argv[] end byte is not
+ * zero, the user has overwritten it with setproctitle(3).
+ *
+ * Possible future enhancement: do this only once when
+ * pos is 0, and set a flag in the 'struct file'.
*/
- rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
- if (rv <= 0)
- goto out_free_page;
-
- rv = 0;
-
- if (c == '\0') {
- /* Command line (set of strings) occupies whole ARGV. */
- if (len1 <= *pos)
- goto out_free_page;
-
- p = arg_start + *pos;
- len = len1 - *pos;
- while (count > 0 && len > 0) {
- unsigned int _count;
- int nr_read;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
+ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
+ return get_mm_proctitle(mm, buf, count, pos, arg_start);
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
- }
- } else {
- /*
- * Command line (1 string) occupies ARGV and
- * extends into ENVP.
- */
- struct {
- unsigned long p;
- unsigned long len;
- } cmdline[2] = {
- { .p = arg_start, .len = len1 },
- { .p = env_start, .len = len2 },
- };
- loff_t pos1 = *pos;
- unsigned int i;
-
- i = 0;
- while (i < 2 && pos1 >= cmdline[i].len) {
- pos1 -= cmdline[i].len;
- i++;
- }
- while (i < 2) {
- p = cmdline[i].p + pos1;
- len = cmdline[i].len - pos1;
- while (count > 0 && len > 0) {
- unsigned int _count, l;
- int nr_read;
- bool final;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- /*
- * Command line can be shorter than whole ARGV
- * even if last "marker" byte says it is not.
- */
- final = false;
- l = strnlen(page, nr_read);
- if (l < nr_read) {
- nr_read = l;
- final = true;
- }
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
-
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
-
- if (final)
- goto out_free_page;
- }
+ /*
+ * For the non-setproctitle() case we limit things strictly
+ * to the [arg_start, arg_end[ range.
+ */
+ pos += arg_start;
+ if (pos < arg_start || pos >= arg_end)
+ return 0;
+ if (count > arg_end - pos)
+ count = arg_end - pos;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ len = 0;
+ while (count) {
+ int got;
+ size_t size = min_t(size_t, PAGE_SIZE, count);
- /* Only first chunk can be read partially. */
- pos1 = 0;
- i++;
+ got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+ if (got <= 0)
+ break;
+ got -= copy_to_user(buf, page, got);
+ if (unlikely(!got)) {
+ if (!len)
+ len = -EFAULT;
+ break;
}
+ pos += got;
+ buf += got;
+ len += got;
+ count -= got;
}
-out_free_page:
free_page((unsigned long)page);
-out_mmput:
+ return len;
+}
+
+static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mm_struct *mm;
+ ssize_t ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = get_mm_cmdline(mm, buf, count, pos);
mmput(mm);
- if (rv > 0)
- *pos += rv;
- return rv;
+ return ret;
+}
+
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct task_struct *tsk;
+ ssize_t ret;
+
+ BUG_ON(*pos < 0);
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH;
+ ret = get_task_cmdline(tsk, buf, count, pos);
+ put_task_struct(tsk);
+ if (ret > 0)
+ *pos += ret;
+ return ret;
}
static const struct file_operations proc_pid_cmdline_ops = {
@@ -379,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol. If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
*/
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -387,25 +424,28 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
unsigned long wchan;
char symname[KSYM_NAME_LEN];
- wchan = get_wchan(task);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
- if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
- && !lookup_symbol_name(wchan, symname))
- seq_printf(m, "%s", symname);
- else
- seq_putc(m, '0');
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+print0:
+ seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
static int lock_trace(struct task_struct *task)
{
- int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ int err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
- mutex_unlock(&task->signal->cred_guard_mutex);
+ up_read(&task->signal->exec_update_lock);
return -EPERM;
}
return 0;
@@ -413,7 +453,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
- mutex_unlock(&task->signal->cred_guard_mutex);
+ up_read(&task->signal->exec_update_lock);
}
#ifdef CONFIG_STACKTRACE
@@ -423,28 +463,39 @@ static void unlock_trace(struct task_struct *task)
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- struct stack_trace trace;
unsigned long *entries;
int err;
- int i;
- entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+ /*
+ * The ability to racily run the kernel stack unwinder on a running task
+ * and then observe the unwinder output is scary; while it is useful for
+ * debugging kernel issues, it can also allow an attacker to leak kernel
+ * stack contents.
+ * Doing this in a manner that is at least safe from races would require
+ * some work to ensure that the remote task can not be scheduled; and
+ * even then, this would still expose the unwinder as local attack
+ * surface.
+ * Therefore, this interface is restricted to root.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
+ GFP_KERNEL);
if (!entries)
return -ENOMEM;
- trace.nr_entries = 0;
- trace.max_entries = MAX_STACK_TRACE_DEPTH;
- trace.entries = entries;
- trace.skip = 0;
-
err = lock_trace(task);
if (!err) {
- save_stack_trace_tsk(task, &trace);
+ unsigned int i, nr_entries;
+
+ nr_entries = stack_trace_save_tsk(task, entries,
+ MAX_STACK_TRACE_DEPTH, 0);
- for (i = 0; i < trace.nr_entries; i++) {
- seq_printf(m, "[<%pK>] %pB\n",
- (void *)entries[i], (void *)entries[i]);
+ for (i = 0; i < nr_entries; i++) {
+ seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
}
+
unlock_trace(task);
}
kfree(entries);
@@ -461,7 +512,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -482,7 +533,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
if (!task)
return -ESRCH;
seq_puts(m, "Latency Top version : v0.1\n");
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < LT_SAVECOUNT; i++) {
struct latency_record *lr = &task->latency_record[i];
if (lr->backtrace[0]) {
int q;
@@ -490,10 +541,9 @@ static int lstats_show_proc(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
seq_printf(m, " %ps", (void *)bt);
}
seq_putc(m, '\n');
@@ -516,7 +566,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf,
if (!task)
return -ESRCH;
- clear_all_latency_tracing(task);
+ clear_tsk_latency_tracing(task);
put_task_struct(task);
return count;
@@ -535,11 +585,19 @@ static const struct file_operations proc_lstats_operations = {
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long totalpages = totalram_pages + total_swap_pages;
+ unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN for all others scale the
+ * badness value into [0, 2000] range which we have been
+ * exporting for a long time so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
- points = oom_badness(task, NULL, NULL, totalpages) *
- 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -586,8 +644,10 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
/*
* print the file header
*/
- seq_printf(m, "%-25s %-20s %-20s %-10s\n",
- "Limit", "Soft Limit", "Hard Limit", "Units");
+ seq_puts(m, "Limit "
+ "Soft Limit "
+ "Hard Limit "
+ "Units \n");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
@@ -615,24 +675,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- long nr;
- unsigned long args[6], sp, pc;
+ struct syscall_info info;
+ u64 *args = &info.data.args[0];
int res;
res = lock_trace(task);
if (res)
return res;
- if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+ if (task_current_syscall(task, &info))
seq_puts(m, "running\n");
- else if (nr < 0)
- seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ else if (info.data.nr < 0)
+ seq_printf(m, "%d 0x%llx 0x%llx\n",
+ info.data.nr, info.sp, info.data.instruction_pointer);
else
seq_printf(m,
- "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
- nr,
+ "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+ info.data.nr,
args[0], args[1], args[2], args[3], args[4], args[5],
- sp, pc);
+ info.sp, info.data.instruction_pointer);
unlock_trace(task);
return 0;
@@ -644,10 +705,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/************************************************************************/
/* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
+static bool proc_fd_access_allowed(struct inode *inode)
{
struct task_struct *task;
- int allowed = 0;
+ bool allowed = false;
/* Allow access to a task's file descriptors if it is us or we
* may use ptrace attach to the process and find out that
* information.
@@ -660,7 +721,8 @@ static int proc_fd_access_allowed(struct inode *inode)
return allowed;
}
-int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
{
int error;
struct inode *inode = d_inode(dentry);
@@ -668,12 +730,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_MODE)
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
return 0;
}
@@ -681,32 +742,41 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
* May current process learn task's sched/cmdline info (for hide_pid_min=1)
* or euid/egid (for hide_pid_min=2)?
*/
-static bool has_pid_permissions(struct pid_namespace *pid,
+static bool has_pid_permissions(struct proc_fs_info *fs_info,
struct task_struct *task,
- int hide_pid_min)
+ enum proc_hidepid hide_pid_min)
{
- if (pid->hide_pid < hide_pid_min)
+ /*
+ * If 'hidpid' mount option is set force a ptrace check,
+ * we indicate that we are using a filesystem syscall
+ * by passing PTRACE_MODE_READ_FSCREDS
+ */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
+ if (fs_info->hide_pid < hide_pid_min)
return true;
- if (in_group_p(pid->pid_gid))
+ if (in_group_p(fs_info->pid_gid))
return true;
return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
-static int proc_pid_permission(struct inode *inode, int mask)
+static int proc_pid_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
- struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
bool has_perms;
task = get_proc_task(inode);
if (!task)
return -ESRCH;
- has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+ has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
put_task_struct(task);
if (!has_perms) {
- if (pid->hide_pid == HIDEPID_INVISIBLE) {
+ if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
/*
* Let's make getdents(), stat(), and open()
* consistent with each other. If a process
@@ -718,7 +788,7 @@ static int proc_pid_permission(struct inode *inode, int mask)
return -EPERM;
}
- return generic_permission(inode, mask);
+ return generic_permission(&nop_mnt_idmap, inode, mask);
}
@@ -730,13 +800,11 @@ static const struct inode_operations proc_def_inode_operations = {
static int proc_single_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
- struct pid_namespace *ns;
- struct pid *pid;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+ struct pid *pid = proc_pid(inode);
struct task_struct *task;
int ret;
- ns = inode->i_sb->s_fs_info;
- pid = proc_pid(inode);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
@@ -759,23 +827,31 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ * - Returns mm_struct* on success
+ * - Returns error code on failure
+ */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
- struct mm_struct *mm = ERR_PTR(-ESRCH);
+ struct mm_struct *mm;
- if (task) {
- mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
- put_task_struct(task);
+ if (!task)
+ return ERR_PTR(-ESRCH);
- if (!IS_ERR_OR_NULL(mm)) {
- /* ensure this mm_struct can't be freed */
- mmgrab(mm);
- /* but do not pin its memory */
- mmput(mm);
- }
- }
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ put_task_struct(task);
+
+ if (IS_ERR(mm))
+ return mm == ERR_PTR(-ESRCH) ? NULL : mm;
+
+ /* ensure this mm_struct can't be freed */
+ mmgrab(mm);
+ /* but do not pin its memory */
+ mmput(mm);
return mm;
}
@@ -784,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct mm_struct *mm = proc_mem_open(inode, mode);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
@@ -793,12 +869,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -813,7 +908,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mm)
return 0;
- page = (char *)__get_free_page(GFP_TEMPORARY);
+ page = (char *)__get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
@@ -821,10 +916,12 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -898,6 +995,7 @@ static const struct file_operations proc_mem_operations = {
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
@@ -918,7 +1016,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!mm || !mm->env_end)
return 0;
- page = (char *)__get_free_page(GFP_TEMPORARY);
+ page = (char *)__get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
@@ -926,10 +1024,10 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
while (count > 0) {
size_t this_len, max_len;
@@ -943,7 +1041,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
max_len = min_t(size_t, PAGE_SIZE, count);
this_len = min(max_len, this_len);
- retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
+ retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
if (retval <= 0) {
ret = retval;
@@ -1018,13 +1116,14 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
OOM_SCORE_ADJ_MAX;
put_task_struct(task);
+ if (oom_adj > OOM_ADJUST_MAX)
+ oom_adj = OOM_ADJUST_MAX;
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
- static DEFINE_MUTEX(oom_adj_mutex);
struct mm_struct *mm = NULL;
struct task_struct *task;
int err = 0;
@@ -1064,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (atomic_read(&p->mm->mm_users) > 1) {
+ if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
mm = p->mm;
mmgrab(mm);
}
@@ -1091,10 +1190,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1123,11 +1218,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1183,11 +1277,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1215,7 +1308,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
@@ -1242,6 +1335,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
kuid_t kloginuid;
int rv;
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1324,13 +1421,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1370,7 +1467,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
- WRITE_ONCE(task->fail_nth, n);
+ task->fail_nth = n;
put_task_struct(task);
return count;
@@ -1386,12 +1483,9 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
- len = snprintf(numbuf, sizeof(numbuf), "%u\n",
- READ_ONCE(task->fail_nth));
- len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
+ len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
put_task_struct(task);
-
- return len;
+ return simple_read_from_buffer(buf, count, ppos, numbuf, len);
}
static const struct file_operations proc_fail_nth_operations = {
@@ -1401,19 +1495,19 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
static int sched_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
struct task_struct *p;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
- proc_sched_show_task(p, m);
+ proc_sched_show_task(p, ns, m);
put_task_struct(p);
@@ -1450,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -1477,11 +1569,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1527,15 +1618,116 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {
#endif /* CONFIG_SCHED_AUTOGROUP */
+#ifdef CONFIG_TIME_NS
+static int timens_offsets_show(struct seq_file *m, void *v)
+{
+ struct task_struct *p;
+
+ p = get_proc_task(file_inode(m->file));
+ if (!p)
+ return -ESRCH;
+ proc_timens_show_offsets(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ struct proc_timens_offset offsets[2];
+ char *kbuf = NULL, *pos, *next_line;
+ struct task_struct *p;
+ int ret, noffsets;
+
+ /* Only allow < page size writes at the beginning of the file */
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ return -EINVAL;
+
+ /* Slurp in the user data */
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ noffsets = 0;
+ for (pos = kbuf; pos; pos = next_line) {
+ struct proc_timens_offset *off = &offsets[noffsets];
+ char clock[10];
+ int err;
+
+ /* Find the end of line and ensure we don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
+ }
+
+ err = sscanf(pos, "%9s %lld %lu", clock,
+ &off->val.tv_sec, &off->val.tv_nsec);
+ if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
+ goto out;
+
+ clock[sizeof(clock) - 1] = 0;
+ if (strcmp(clock, "monotonic") == 0 ||
+ strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+ off->clockid = CLOCK_MONOTONIC;
+ else if (strcmp(clock, "boottime") == 0 ||
+ strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+ off->clockid = CLOCK_BOOTTIME;
+ else
+ goto out;
+
+ noffsets++;
+ if (noffsets == ARRAY_SIZE(offsets)) {
+ if (next_line)
+ count = next_line - kbuf;
+ break;
+ }
+ }
+
+ ret = -ESRCH;
+ p = get_proc_task(inode);
+ if (!p)
+ goto out;
+ ret = proc_timens_set_offset(file, p, offsets, noffsets);
+ put_task_struct(p);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static int timens_offsets_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timens_offsets_show, inode);
+}
+
+static const struct file_operations proc_timens_offsets_operations = {
+ .open = timens_offsets_open,
+ .read = seq_read,
+ .write = timens_offsets_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_TIME_NS */
+
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -1543,8 +1735,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- if (same_thread_group(current, p))
+ if (same_thread_group(current, p)) {
set_task_comm(p, buffer);
+ proc_comm_connector(p);
+ }
else
count = -EINVAL;
@@ -1562,9 +1756,8 @@ static int comm_show(struct seq_file *m, void *v)
if (!p)
return -ESRCH;
- task_lock(p);
- seq_printf(m, "%s\n", p->comm);
- task_unlock(p);
+ proc_task_name(m, p, false);
+ seq_putc(m, '\n');
put_task_struct(p);
@@ -1621,33 +1814,32 @@ static const char *proc_pid_get_link(struct dentry *dentry,
if (error)
goto out;
- nd_jump_link(&path);
- return NULL;
+ error = nd_jump_link(&path);
out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
@@ -1680,7 +1872,7 @@ const struct inode_operations proc_pid_link_inode_operations = {
/* building an inode */
-void task_dump_owner(struct task_struct *task, mode_t mode,
+void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t *ruid, kgid_t *rgid)
{
/* Depending on the state of dumpable compute who should own a
@@ -1690,6 +1882,12 @@ void task_dump_owner(struct task_struct *task, mode_t mode,
kuid_t uid;
kgid_t gid;
+ if (unlikely(task->flags & PF_KTHREAD)) {
+ *ruid = GLOBAL_ROOT_UID;
+ *rgid = GLOBAL_ROOT_GID;
+ return;
+ }
+
/* Default to the tasks effective ownership */
rcu_read_lock();
cred = __task_cred(task);
@@ -1732,11 +1930,23 @@ void task_dump_owner(struct task_struct *task, mode_t mode,
*rgid = gid;
}
-struct inode *proc_pid_make_inode(struct super_block * sb,
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+ struct pid *pid = ei->pid;
+
+ if (S_ISDIR(ei->vfs_inode.i_mode)) {
+ spin_lock(&pid->lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(&pid->lock);
+ }
+}
+
+struct inode *proc_pid_make_inode(struct super_block *sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
+ struct pid *pid;
/* We need a new inode */
@@ -1748,16 +1958,19 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
* grab the reference to task.
*/
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
+ pid = get_task_pid(task, PIDTYPE_PID);
+ if (!pid)
goto out_unlock;
+ /* Let the pid remember us for quick removal */
+ ei->pid = pid;
+
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1769,21 +1982,54 @@ out_unlock:
return NULL;
}
-int pid_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
+int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
- struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- rcu_read_lock();
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
+ rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
- if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
rcu_read_unlock();
/*
* This doesn't prevent learning whether PID exists,
@@ -1800,34 +2046,41 @@ int pid_getattr(const struct path *path, struct kstat *stat,
/* dentry stuff */
/*
- * Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- *
+ * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+ */
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+ task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
+ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+}
+
+/*
* Rewrite the inode's ownerships here because the owning task may have
* performed a setuid(), etc.
*
*/
-int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
- task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
-
- inode->i_mode &= ~(S_ISUID | S_ISGID);
- security_task_to_inode(task, inode);
- put_task_struct(task);
- return 1;
+ pid_update_inode(task, inode);
+ ret = 1;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static inline bool proc_inode_is_dead(struct inode *inode)
@@ -1859,33 +2112,36 @@ const struct dentry_operations pid_dentry_operations =
* file type from dcache entry.
*
* Since all of the proc inode numbers are dynamically generated, the inode
- * numbers do not exist until the inode is cache. This means creating the
+ * numbers do not exist until the inode is cache. This means creating
* the dcache entry in readdir is necessary to keep the inode numbers
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
- const char *name, int len,
+ const char *name, unsigned int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
struct dentry *child, *dir = file->f_path.dentry;
struct qstr qname = QSTR_INIT(name, len);
struct inode *inode;
- unsigned type;
- ino_t ino;
+ unsigned type = DT_UNKNOWN;
+ ino_t ino = 1;
- child = d_hash_and_lookup(dir, &qname);
+ child = try_lookup_noperm(&qname, dir);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
if (IS_ERR(child))
goto end_instantiate;
if (d_in_lookup(child)) {
- int err = instantiate(d_inode(dir), child, task, ptr);
+ struct dentry *res;
+ res = instantiate(child, task, ptr);
d_lookup_done(child);
- if (err < 0) {
+ if (unlikely(res)) {
dput(child);
- goto end_instantiate;
+ child = res;
+ if (IS_ERR(child))
+ goto end_instantiate;
}
}
}
@@ -1893,10 +2149,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
- return dir_emit(ctx, name, len, ino, type);
-
end_instantiate:
- return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
+ return dir_emit(ctx, name, len, ino, type);
}
/*
@@ -1906,13 +2160,43 @@ end_instantiate:
static int dname_to_vma_addr(struct dentry *dentry,
unsigned long *start, unsigned long *end)
{
- if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+ const char *str = dentry->d_name.name;
+ unsigned long long sval, eval;
+ unsigned int len;
+
+ if (str[0] == '0' && str[1] != '-')
return -EINVAL;
+ len = _parse_integer(str, 16, &sval);
+ if (len & KSTRTOX_OVERFLOW)
+ return -EINVAL;
+ if (sval != (unsigned long)sval)
+ return -EINVAL;
+ str += len;
+
+ if (*str != '-')
+ return -EINVAL;
+ str++;
+
+ if (str[0] == '0' && str[1])
+ return -EINVAL;
+ len = _parse_integer(str, 16, &eval);
+ if (len & KSTRTOX_OVERFLOW)
+ return -EINVAL;
+ if (eval != (unsigned long)eval)
+ return -EINVAL;
+ str += len;
+
+ if (*str != '\0')
+ return -EINVAL;
+
+ *start = sval;
+ *end = eval;
return 0;
}
-static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int map_files_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
unsigned long vm_start, vm_end;
bool exact_vma_exists = false;
@@ -1930,13 +2214,16 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_notask;
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm))
+ if (IS_ERR(mm))
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
- down_read(&mm->mmap_sem);
- exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
- up_read(&mm->mmap_sem);
+ status = mmap_read_lock_killable(mm);
+ if (!status) {
+ exact_vma_exists = !!find_exact_vma(mm, vm_start,
+ vm_end);
+ mmap_read_unlock(mm);
+ }
}
mmput(mm);
@@ -1982,15 +2269,18 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
if (rc)
goto out_mmput;
+ rc = mmap_read_lock_killable(mm);
+ if (rc)
+ goto out_mmput;
+
rc = -ENOENT;
- down_read(&mm->mmap_sem);
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_mmput:
mmput(mm);
@@ -1999,22 +2289,22 @@ out:
}
struct map_files_info {
+ unsigned long start;
+ unsigned long end;
fmode_t mode;
- unsigned int len;
- unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
/*
- * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
- * symlinks may be used to bypass permissions on ancestor directories in the
- * path to the file in question.
+ * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
+ * to concerns about how the symlinks may be used to bypass permissions on
+ * ancestor directories in the path to the file in question.
*/
static const char *
proc_map_files_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(&init_user_ns))
return ERR_PTR(-EPERM);
return proc_pid_get_link(dentry, inode, done);
@@ -2029,19 +2319,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = {
.setattr = proc_setattr,
};
-static int
-proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+static struct dentry *
+proc_map_files_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
fmode_t mode = (fmode_t)(unsigned long)ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
((mode & FMODE_READ ) ? S_IRUSR : 0) |
((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
- return -ENOENT;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
ei->op.proc_get_link = map_files_get_link;
@@ -2049,10 +2339,8 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- d_add(dentry, inode);
-
- return 0;
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2061,19 +2349,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
struct task_struct *task;
- int result;
+ struct dentry *result;
struct mm_struct *mm;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
task = get_proc_task(dir);
if (!task)
goto out;
- result = -EACCES;
+ result = ERR_PTR(-EACCES);
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
goto out_put_task;
@@ -2081,22 +2369,27 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
if (!mm)
goto out_put_task;
- down_read(&mm->mmap_sem);
+ result = ERR_PTR(-EINTR);
+ if (mmap_read_lock_killable(mm))
+ goto out_put_mm;
+
+ result = ERR_PTR(-ENOENT);
vma = find_exact_vma(mm, vm_start, vm_end);
if (!vma)
goto out_no_vma;
if (vma->vm_file)
- result = proc_map_files_instantiate(dir, dentry, task,
+ result = proc_map_files_instantiate(dentry, task,
(void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
+out_put_mm:
mmput(mm);
out_put_task:
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
static const struct inode_operations proc_map_files_inode_operations = {
@@ -2112,10 +2405,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
- struct flex_array *fa = NULL;
- struct map_files_info info;
+ GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ struct vma_iterator vmi;
+
+ genradix_init(&fa);
ret = -ENOENT;
task = get_proc_task(file_inode(file));
@@ -2133,71 +2428,67 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
mm = get_task_mm(task);
if (!mm)
goto out_put_task;
- down_read(&mm->mmap_sem);
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ mmput(mm);
+ goto out_put_task;
+ }
nr_files = 0;
/*
* We need two passes here:
*
- * 1) Collect vmas of mapped files with mmap_sem taken
- * 2) Release mmap_sem and instantiate entries
+ * 1) Collect vmas of mapped files with mmap_lock taken
+ * 2) Release mmap_lock and instantiate entries
*
* otherwise we get lockdep complained, since filldir()
- * routine might require mmap_sem taken in might_fault().
+ * routine might require mmap_lock taken in might_fault().
*/
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
- if (vma->vm_file && ++pos > ctx->pos)
- nr_files++;
- }
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
- if (nr_files) {
- fa = flex_array_alloc(sizeof(info), nr_files,
- GFP_KERNEL);
- if (!fa || flex_array_prealloc(fa, 0, nr_files,
- GFP_KERNEL)) {
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
ret = -ENOMEM;
- if (fa)
- flex_array_free(fa);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
goto out_put_task;
}
- for (i = 0, vma = mm->mmap, pos = 2; vma;
- vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (++pos <= ctx->pos)
- continue;
- info.mode = vma->vm_file->f_mode;
- info.len = snprintf(info.name,
- sizeof(info.name), "%lx-%lx",
- vma->vm_start, vma->vm_end);
- if (flex_array_put(fa, i++, &info, GFP_KERNEL))
- BUG();
- }
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
+ mmput(mm);
for (i = 0; i < nr_files; i++) {
- p = flex_array_get(fa, i);
+ char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
+ unsigned int len;
+
+ p = genradix_ptr(&fa, i);
+ len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
- p->name, p->len,
+ buf, len,
proc_map_files_instantiate,
task,
(void *)(unsigned long)p->mode))
break;
ctx->pos++;
}
- if (fa)
- flex_array_free(fa);
- mmput(mm);
out_put_task:
put_task_struct(task);
out:
+ genradix_free(&fa);
return ret;
}
@@ -2209,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2224,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%p\n",
- timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
+ timer->sigq.info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -2296,7 +2579,7 @@ static int proc_timers_open(struct inode *inode, struct file *file)
return -ENOMEM;
tp->pid = proc_pid(inode);
- tp->ns = inode->i_sb->s_fs_info;
+ tp->ns = proc_pid_ns(inode->i_sb);
return 0;
}
@@ -2325,10 +2608,13 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
return -ESRCH;
if (p != current) {
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
count = -EPERM;
goto out;
}
+ rcu_read_unlock();
err = security_task_setscheduler(p);
if (err) {
@@ -2338,10 +2624,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (rt_or_dl_task_policy(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
@@ -2361,11 +2648,14 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
return -ESRCH;
if (p != current) {
-
- if (!capable(CAP_SYS_NICE)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
goto out;
}
+ rcu_read_unlock();
+
err = security_task_getscheduler(p);
if (err)
goto out;
@@ -2394,16 +2684,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
.release = single_release,
};
-static int proc_pident_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
+ inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
if (S_ISDIR(inode->i_mode))
@@ -2413,25 +2703,17 @@ static int proc_pident_instantiate(struct inode *dir,
if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
- d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ pid_update_inode(task, inode);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
struct dentry *dentry,
- const struct pid_entry *ents,
- unsigned int nents)
+ const struct pid_entry *p,
+ const struct pid_entry *end)
{
- int error;
struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -2440,21 +2722,17 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents];
- for (p = ents; p < last; p++) {
+ for (; p < end; p++) {
if (p->len != dentry->d_name.len)
continue;
- if (!memcmp(dentry->d_name.name, p->name, p->len))
+ if (!memcmp(dentry->d_name.name, p->name, p->len)) {
+ res = proc_pident_instantiate(dentry, task, p);
break;
+ }
}
- if (p >= last)
- goto out;
-
- error = proc_pident_instantiate(dir, dentry, task, p);
-out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
@@ -2484,6 +2762,13 @@ out:
}
#ifdef CONFIG_SECURITY
+static int proc_pid_attr_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ return 0;
+}
+
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2495,8 +2780,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task,
- (char*)file->f_path.dentry->d_name.name,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -2509,62 +2794,128 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
+ struct task_struct *task;
void *page;
- ssize_t length;
- struct task_struct *task = get_proc_task(inode);
+ int rv;
- length = -ESRCH;
- if (!task)
- goto out_no_task;
+ /* A task may only write when it was the opener. */
+ if (file->private_data != current->mm)
+ return -EPERM;
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
/* A task may only write its own attributes. */
- length = -EACCES;
- if (current != task)
- goto out;
+ if (current != task) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+ /* Prevent changes to overridden credentials. */
+ if (current_cred() != current_real_cred()) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ rcu_read_unlock();
if (count > PAGE_SIZE)
count = PAGE_SIZE;
/* No partial writes. */
- length = -EINVAL;
if (*ppos != 0)
- goto out;
+ return -EINVAL;
page = memdup_user(buf, count);
if (IS_ERR(page)) {
- length = PTR_ERR(page);
+ rv = PTR_ERR(page);
goto out;
}
/* Guard against adverse ptrace interaction */
- length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
- if (length < 0)
+ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
+ if (rv < 0)
goto out_free;
- length = security_setprocattr(file->f_path.dentry->d_name.name,
- page, count);
+ rv = security_setprocattr(PROC_I(inode)->op.lsmid,
+ file->f_path.dentry->d_name.name, page,
+ count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
out:
- put_task_struct(task);
-out_no_task:
- return length;
+ return rv;
}
static const struct file_operations proc_pid_attr_operations = {
+ .open = proc_pid_attr_open,
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
.llseek = generic_file_llseek,
+ .release = mem_release,
};
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate_shared = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR(LSM_ID_SMACK, "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
+#ifdef CONFIG_SECURITY_APPARMOR
+static const struct pid_entry apparmor_attr_dir_stuff[] = {
+ ATTR(LSM_ID_APPARMOR, "current", 0666),
+ ATTR(LSM_ID_APPARMOR, "prev", 0444),
+ ATTR(LSM_ID_APPARMOR, "exec", 0666),
+};
+LSM_DIR_OPS(apparmor);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("prev", S_IRUGO, proc_pid_attr_operations),
- REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ ATTR(LSM_ID_UNDEF, "current", 0666),
+ ATTR(LSM_ID_UNDEF, "prev", 0444),
+ ATTR(LSM_ID_UNDEF, "exec", 0666),
+ ATTR(LSM_ID_UNDEF, "fscreate", 0666),
+ ATTR(LSM_ID_UNDEF, "keycreate", 0666),
+ ATTR(LSM_ID_UNDEF, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
+#ifdef CONFIG_SECURITY_APPARMOR
+ DIR("apparmor", 0555,
+ proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -2583,7 +2934,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+ attr_dir_stuff,
+ attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
}
static const struct inode_operations proc_attr_dir_inode_operations = {
@@ -2610,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
ret = 0;
mm = get_task_mm(task);
if (mm) {
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
len = snprintf(buffer, sizeof(buffer), "%08lx\n",
- ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+ ((flags & MMF_DUMP_FILTER_MASK) >>
MMF_DUMP_FILTER_SHIFT));
mmput(mm);
ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -2650,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
- set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm);
else
- clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm);
}
mmput(mm);
@@ -2674,11 +3028,10 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
- result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ result = down_read_killable(&task->signal->exec_update_lock);
if (result)
return result;
@@ -2687,15 +3040,21 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ }
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -2714,7 +3073,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
- mutex_unlock(&task->signal->cred_guard_mutex);
+ up_read(&task->signal->exec_update_lock);
return result;
}
@@ -2883,6 +3242,64 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+ int ret = 0;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
+ seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+ seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
+ seq_printf(m, "ksm_merge_any: %s\n",
+ mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no");
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ mmput(mm);
+ return ret;
+ }
+ seq_printf(m, "ksm_mergeable: %s\n",
+ ksm_process_mergeable(mm) ? "yes" : "no");
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
+#ifdef CONFIG_KSTACK_ERASE_METRICS
+static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long prev_depth = THREAD_SIZE -
+ (task->prev_lowest_stack & (THREAD_SIZE - 1));
+ unsigned long depth = THREAD_SIZE -
+ (task->lowest_stack & (THREAD_SIZE - 1));
+
+ seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+ prev_depth, depth);
+ return 0;
+}
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
+
/*
* Thread groups
*/
@@ -2893,7 +3310,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -2903,12 +3320,13 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
+#ifdef CONFIG_TIME_NS
+ REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
+#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
@@ -2930,6 +3348,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -2953,10 +3372,13 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -2970,9 +3392,6 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
-#ifdef CONFIG_HARDWALL
- ONE("hardwall", S_IRUGO, proc_pid_hardwall),
-#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
@@ -2986,6 +3405,19 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+#ifdef CONFIG_KSTACK_ERASE_METRICS
+ ONE("stack_depth", S_IRUGO, proc_stack_depth),
+#endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+ ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3000,10 +3432,19 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek,
};
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+ if (file->f_op != &proc_tgid_base_operations)
+ return ERR_PTR(-EBADF);
+
+ return proc_pid(file_inode(file));
+}
+
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+ tgid_base_stuff,
+ tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
}
static const struct inode_operations proc_tgid_base_inode_operations = {
@@ -3013,130 +3454,64 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission,
};
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
- struct dentry *dentry, *leader, *dir;
- char buf[PROC_NUMBUF];
- struct qstr name;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%d", pid);
- /* no ->d_hash() rejects on procfs */
- dentry = d_hash_and_lookup(mnt->mnt_root, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- if (pid == tgid)
- return;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%d", tgid);
- leader = d_hash_and_lookup(mnt->mnt_root, &name);
- if (!leader)
- goto out;
-
- name.name = "task";
- name.len = strlen(name.name);
- dir = d_hash_and_lookup(leader, &name);
- if (!dir)
- goto out_put_leader;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%d", pid);
- dentry = d_hash_and_lookup(dir, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- dput(dir);
-out_put_leader:
- dput(leader);
-out:
- return;
-}
-
/**
- * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
- *
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
*
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
- * flushed instead. This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
- * that no dcache entries will exist at process exit time it
- * just makes it very unlikely that any will persist.
+ * that no dcache entries will exist after a process is reaped
+ * it just makes it very unlikely that any will persist.
*/
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
{
- int i;
- struct pid *pid, *tgid;
- struct upid *upid;
-
- pid = task_pid(task);
- tgid = task_tgid(task);
-
- for (i = 0; i <= pid->level; i++) {
- upid = &pid->numbers[i];
- proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid->numbers[i].nr);
- }
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}
-static int proc_pid_instantiate(struct inode *dir,
- struct dentry * dentry,
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
set_nlink(inode, nlink_tgid);
+ pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
unsigned tgid;
+ struct proc_fs_info *fs_info;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
tgid = name_to_int(&dentry->d_name);
if (tgid == ~0U)
goto out;
- ns = dentry->d_sb->s_fs_info;
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
rcu_read_lock();
task = find_task_by_pid_ns(tgid, ns);
if (task)
@@ -3145,10 +3520,17 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
if (!task)
goto out;
- result = proc_pid_instantiate(dir, dentry, task, NULL);
+ /* Limit procfs to only ptraceable tasks */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
+ goto out_put_task;
+ }
+
+ result = proc_pid_instantiate(dentry, task, NULL);
+out_put_task:
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
/*
@@ -3171,20 +3553,8 @@ retry:
pid = find_ge_pid(iter.tgid, ns);
if (pid) {
iter.tgid = pid_nr_ns(pid, ns);
- iter.task = pid_task(pid, PIDTYPE_PID);
- /* What we to know is if the pid we have find is the
- * pid of a thread_group_leader. Testing for task
- * being a thread_group_leader is the obvious thing
- * todo but there is a window when it fails, due to
- * the pid transfer logic in de_thread.
- *
- * So we perform the straight forward test of seeing
- * if the pid we have found is the pid of a thread
- * group leader, and don't worry if the task we have
- * found doesn't happen to be a thread group leader.
- * As we don't care in the case of readdir.
- */
- if (!iter.task || !has_group_leader_pid(iter.task)) {
+ iter.task = pid_task(pid, PIDTYPE_TGID);
+ if (!iter.task) {
iter.tgid += 1;
goto retry;
}
@@ -3200,21 +3570,20 @@ retry:
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+ struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
+ struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
return 0;
if (pos == TGID_OFFSET - 2) {
- struct inode *inode = d_inode(ns->proc_self);
- if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
if (pos == TGID_OFFSET - 1) {
- struct inode *inode = d_inode(ns->proc_thread_self);
- if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
@@ -3223,14 +3592,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
- char name[PROC_NUMBUF];
- int len;
+ char name[10 + 1];
+ unsigned int len;
cond_resched();
- if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
+ if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
continue;
- len = snprintf(name, sizeof(name), "%d", iter.tgid);
+ len = snprintf(name, sizeof(name), "%u", iter.tgid);
ctx->pos = iter.tgid + TGID_OFFSET;
if (!proc_fill_cache(file, ctx, name, len,
proc_pid_instantiate, iter.task, NULL)) {
@@ -3254,7 +3623,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
* This function makes sure that the node is always accessible for members of
* same thread group.
*/
-static int proc_tid_comm_permission(struct inode *inode, int mask)
+static int proc_tid_comm_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
bool is_same_tgroup;
struct task_struct *task;
@@ -3273,11 +3643,12 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
return 0;
}
- return generic_permission(inode, mask);
+ return generic_permission(&nop_mnt_idmap, inode, mask);
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
@@ -3285,7 +3656,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3295,9 +3666,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
@@ -3307,12 +3676,12 @@ static const struct pid_entry tid_base_stuff[] = {
REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_tid_maps_operations),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
REG("children", S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -3322,7 +3691,8 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3346,10 +3716,13 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3360,9 +3733,6 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tid_io_accounting),
#endif
-#ifdef CONFIG_HARDWALL
- ONE("hardwall", S_IRUGO, proc_pid_hardwall),
-#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
@@ -3372,6 +3742,16 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+ ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3383,7 +3763,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
- tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ tid_base_stuff,
+ tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
}
static const struct file_operations proc_tid_base_operations = {
@@ -3398,37 +3779,33 @@ static const struct inode_operations proc_tid_base_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_task_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_task_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
-
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
+
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
- inode->i_flags|=S_IMMUTABLE;
+ inode->i_flags |= S_IMMUTABLE;
set_nlink(inode, nlink_tid);
+ pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
+ struct proc_fs_info *fs_info;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!leader)
goto out_no_task;
@@ -3437,7 +3814,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
if (tid == ~0U)
goto out;
- ns = dentry->d_sb->s_fs_info;
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
rcu_read_lock();
task = find_task_by_pid_ns(tid, ns);
if (task)
@@ -3448,13 +3826,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
if (!same_thread_group(leader, task))
goto out_drop_task;
- result = proc_task_instantiate(dir, dentry, task, NULL);
+ result = proc_task_instantiate(dentry, task, NULL);
out_drop_task:
put_task_struct(task);
out:
put_task_struct(leader);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
/*
@@ -3497,11 +3875,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- pos = task = task->group_leader;
- do {
+ for_each_thread(task, pos) {
if (!nr--)
goto found;
- } while_each_thread(task, pos);
+ }
fail:
pos = NULL;
goto out;
@@ -3523,10 +3900,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
@@ -3548,24 +3923,27 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
- ns = inode->i_sb->s_fs_info;
- tid = (int)file->f_version;
- file->f_version = 0;
+ ns = proc_pid_ns(inode->i_sb);
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
- char name[PROC_NUMBUF];
- int len;
+ char name[10 + 1];
+ unsigned int len;
+
tid = task_pid_nr_ns(task, ns);
+ if (!tid)
+ continue; /* The task has just exited. */
len = snprintf(name, sizeof(name), "%d", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@@ -3574,12 +3952,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
return 0;
}
-static int proc_task_getattr(const struct path *path, struct kstat *stat,
+static int proc_task_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct task_struct *p = get_proc_task(inode);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (p) {
stat->nlink += get_nr_threads(p);
@@ -3589,6 +3968,24 @@ static int proc_task_getattr(const struct path *path, struct kstat *stat,
return 0;
}
+/*
+ * proc_task_readdir() set @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@@ -3599,7 +3996,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
new file mode 100644
index 000000000000..87dcaae32ff8
--- /dev/null
+++ b/fs/proc/bootconfig.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/bootconfig - Extra boot configuration
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/bootconfig.h>
+#include <linux/slab.h>
+
+static char *saved_boot_config;
+
+static int boot_config_proc_show(struct seq_file *m, void *v)
+{
+ if (saved_boot_config)
+ seq_puts(m, saved_boot_config);
+ return 0;
+}
+
+/* Rest size of buffer */
+#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0)
+
+/* Return the needed total length if @size is 0 */
+static int __init copy_xbc_key_value_list(char *dst, size_t size)
+{
+ struct xbc_node *leaf, *vnode;
+ char *key, *end = dst + size;
+ const char *val;
+ char q;
+ int ret = 0;
+
+ key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
+
+ xbc_for_each_key_value(leaf, val) {
+ ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
+ if (ret < 0)
+ break;
+ ret = snprintf(dst, rest(dst, end), "%s = ", key);
+ if (ret < 0)
+ break;
+ dst += ret;
+ vnode = xbc_node_get_child(leaf);
+ if (vnode) {
+ xbc_array_for_each_value(vnode, val) {
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, xbc_node_is_array(vnode) ? ", " : "\n");
+ if (ret < 0)
+ goto out;
+ dst += ret;
+ }
+ } else {
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
+ if (ret < 0)
+ break;
+ dst += ret;
+ }
+ }
+ if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
+ }
+out:
+ kfree(key);
+
+ return ret < 0 ? ret : dst - (end - size);
+}
+
+static int __init proc_boot_config_init(void)
+{
+ int len;
+
+ len = copy_xbc_key_value_list(NULL, 0);
+ if (len < 0)
+ return len;
+
+ if (len > 0) {
+ saved_boot_config = kzalloc(len + 1, GFP_KERNEL);
+ if (!saved_boot_config)
+ return -ENOMEM;
+
+ len = copy_xbc_key_value_list(saved_boot_config, len + 1);
+ if (len < 0) {
+ kfree(saved_boot_config);
+ return len;
+ }
+ }
+
+ proc_create_single("bootconfig", 0, NULL, boot_config_proc_show);
+
+ return 0;
+}
+fs_initcall(proc_boot_config_init);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index cbd82dff7e81..a6f76121955f 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -1,29 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%s\n", saved_command_line);
+ seq_puts(m, saved_command_line);
+ seq_putc(m, '\n');
return 0;
}
-static int cmdline_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cmdline_proc_show, NULL);
-}
-
-static const struct file_operations cmdline_proc_fops = {
- .open = cmdline_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_cmdline_init(void)
{
- proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde_make_permanent(pde);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 290ba85cb900..b7cab1ad990d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2010 Werner Fink, Jiri Slaby
- *
- * Licensed under GPLv2
*/
#include <linux/console.h>
@@ -22,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NBCON, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
@@ -34,7 +34,16 @@ static int show_console_dev(struct seq_file *m, void *v)
if (con->device) {
const struct tty_driver *driver;
int index;
+
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
+ console_lock();
driver = con->device(con, &index);
+ console_unlock();
+
if (driver) {
dev = MKDEV(driver->major, driver->minor_start);
dev += index;
@@ -50,22 +59,27 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
- seq_printf(m, "\n");
-
+ seq_putc(m, '\n');
return 0;
}
static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(&console_mutex)
{
struct console *con;
loff_t off = 0;
- console_lock();
+ /*
+ * Hold the console_list_lock to guarantee safe traversal of the
+ * console list. SRCU cannot be used because there is no
+ * place to store the SRCU cookie.
+ */
+ console_list_lock();
for_each_console(con)
if (off++ == *pos)
break;
@@ -76,13 +90,15 @@ static void *c_start(struct seq_file *m, loff_t *pos)
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
struct console *con = v;
+
++*pos;
- return con->next;
+ return hlist_entry_safe(con->node.next, struct console, node);
}
static void c_stop(struct seq_file *m, void *v)
+ __releases(&console_mutex)
{
- console_unlock();
+ console_list_unlock();
}
static const struct seq_operations consoles_op = {
@@ -92,21 +108,9 @@ static const struct seq_operations consoles_op = {
.show = show_console_dev
};
-static int consoles_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &consoles_op);
-}
-
-static const struct file_operations proc_consoles_operations = {
- .open = consoles_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_consoles_init(void)
{
- proc_create("consoles", 0, NULL, &proc_consoles_operations);
+ proc_create_seq("consoles", 0, NULL, &consoles_op);
return 0;
}
fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 06f4d31e0396..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -1,24 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpufreq.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &cpuinfo_op);
}
-static const struct file_operations proc_cpuinfo_operations = {
- .open = cpuinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = cpuinfo_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init proc_cpuinfo_init(void)
{
- proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+ proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops);
return 0;
}
fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 50493edc30e5..fe7bfcb7d049 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -1,20 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include "internal.h"
static int devinfo_show(struct seq_file *f, void *v)
{
int i = *(loff_t *) v;
- if (i < CHRDEV_MAJOR_HASH_SIZE) {
+ if (i < CHRDEV_MAJOR_MAX) {
if (i == 0)
seq_puts(f, "Character devices:\n");
chrdev_show(f, i);
}
#ifdef CONFIG_BLOCK
else {
- i -= CHRDEV_MAJOR_HASH_SIZE;
+ i -= CHRDEV_MAJOR_MAX;
if (i == 0)
seq_puts(f, "\nBlock devices:\n");
blkdev_show(f, i);
@@ -25,7 +28,7 @@ static int devinfo_show(struct seq_file *f, void *v)
static void *devinfo_start(struct seq_file *f, loff_t *pos)
{
- if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+ if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return pos;
return NULL;
}
@@ -33,7 +36,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos)
static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+ if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return NULL;
return pos;
}
@@ -50,21 +53,12 @@ static const struct seq_operations devinfo_ops = {
.show = devinfo_show
};
-static int devinfo_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &devinfo_ops);
-}
-
-static const struct file_operations proc_devinfo_operations = {
- .open = devinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_devices_init(void)
{
- proc_create("devices", 0, NULL, &proc_devinfo_operations);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index c330495c3115..9eeccff49b2a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/dcache.h>
@@ -5,10 +6,13 @@
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
+#include <linux/filelock.h>
#include <linux/proc_fs.h>
@@ -27,35 +31,35 @@ static int seq_show(struct seq_file *m, void *v)
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
-
+ task_lock(task);
+ files = task->files;
if (files) {
unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_locked(files, fd);
if (file) {
- struct fdtable *fdt = files_fdtable(files);
-
f_flags = file->f_flags;
- if (close_on_exec(fd, fdt))
+ if (close_on_exec(fd, files))
f_flags |= O_CLOEXEC;
get_file(file);
ret = 0;
}
spin_unlock(&files->file_lock);
- put_files_struct(files);
}
+ task_unlock(task);
+ put_task_struct(task);
if (ret)
return ret;
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
+ /* show_fd_locks() never dereferences files, so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -73,6 +77,34 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
return single_open(file, seq_show, inode);
}
+/*
+ * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
+ * that the current task has PTRACE_MODE_READ in addition to the normal
+ * POSIX-like checks.
+ */
+static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
+{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
+ return generic_permission(idmap, inode, mask);
+}
+
+static const struct inode_operations proc_fdinfo_file_inode_operations = {
+ .permission = proc_fdinfo_permission,
+ .setattr = proc_setattr,
+};
+
static const struct file_operations proc_fdinfo_file_operations = {
.open = seq_fdinfo_open,
.read = seq_read,
@@ -80,9 +112,37 @@ static const struct file_operations proc_fdinfo_file_operations = {
.release = single_release,
};
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+{
+ struct file *file;
+
+ file = fget_task(task, fd);
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
+ return !!file;
+}
+
+static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
+ fmode_t f_mode)
+{
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
+ security_task_to_inode(task, inode);
+}
+
+static int tid_fd_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- struct files_struct *files;
struct task_struct *task;
struct inode *inode;
unsigned int fd;
@@ -95,35 +155,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
fd = proc_fd(inode);
if (task) {
- files = get_files_struct(task);
- if (files) {
- struct file *file;
-
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- unsigned f_mode = file->f_mode;
-
- rcu_read_unlock();
- put_files_struct(files);
-
- task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-
- if (S_ISLNK(inode->i_mode)) {
- unsigned i_mode = S_IFLNK;
- if (f_mode & FMODE_READ)
- i_mode |= S_IRUSR | S_IXUSR;
- if (f_mode & FMODE_WRITE)
- i_mode |= S_IWUSR | S_IXUSR;
- inode->i_mode = i_mode;
- }
-
- security_task_to_inode(task, inode);
- put_task_struct(task);
- return 1;
- }
- rcu_read_unlock();
- put_files_struct(files);
+ fmode_t f_mode;
+ if (tid_fd_mode(task, fd, &f_mode)) {
+ tid_fd_update_inode(task, inode, f_mode);
+ put_task_struct(task);
+ return 1;
}
put_task_struct(task);
}
@@ -137,62 +173,54 @@ static const struct dentry_operations tid_fd_dentry_operations = {
static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- struct files_struct *files = NULL;
struct task_struct *task;
int ret = -ENOENT;
task = get_proc_task(d_inode(dentry));
if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
-
- if (files) {
unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
- spin_lock(&files->file_lock);
- fd_file = fcheck_files(files, fd);
+ fd_file = fget_task(task, fd);
if (fd_file) {
*path = fd_file->f_path;
path_get(&fd_file->f_path);
ret = 0;
+ fput(fd_file);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ put_task_struct(task);
}
return ret;
}
-static int
-proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+struct fd_data {
+ fmode_t mode;
+ unsigned fd;
+};
+
+static struct dentry *proc_fd_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
+ tid_fd_update_inode(task, inode, data->mode);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
-
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -200,26 +228,27 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
instantiate_t instantiate)
{
struct task_struct *task = get_proc_task(dir);
- int result = -ENOENT;
- unsigned fd = name_to_int(&dentry->d_name);
+ struct fd_data data = {.fd = name_to_int(&dentry->d_name)};
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
- if (fd == ~0U)
+ if (data.fd == ~0U)
+ goto out;
+ if (!tid_fd_mode(task, data.fd, &data.mode))
goto out;
- result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+ result = instantiate(dentry, task, &data);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
static int proc_readfd_common(struct file *file, struct dir_context *ctx,
instantiate_t instantiate)
{
struct task_struct *p = get_proc_task(file_inode(file));
- struct files_struct *files;
unsigned int fd;
if (!p)
@@ -227,45 +256,65 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- files = get_files_struct(p);
- if (!files)
- goto out;
- rcu_read_lock();
- for (fd = ctx->pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, ctx->pos++) {
- char name[PROC_NUMBUF];
- int len;
-
- if (!fcheck_files(files, fd))
- continue;
- rcu_read_unlock();
+ for (fd = ctx->pos - 2;; fd++) {
+ struct file *f;
+ struct fd_data data;
+ char name[10 + 1];
+ unsigned int len;
+
+ f = fget_task_next(p, &fd);
+ ctx->pos = fd + 2LL;
+ if (!f)
+ break;
+ data.mode = f->f_mode;
+ fput(f);
+ data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
- (void *)(unsigned long)fd))
- goto out_fd_loop;
+ &data))
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
out:
put_task_struct(p);
return 0;
}
-static int proc_readfd(struct file *file, struct dir_context *ctx)
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int proc_fd_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfd,
+ .iterate_shared = proc_fd_iterate,
.llseek = generic_file_llseek,
};
@@ -279,12 +328,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
*/
-int proc_fd_permission(struct inode *inode, int mask)
+int proc_fd_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
struct task_struct *p;
int rv;
- rv = generic_permission(inode, mask);
+ rv = generic_permission(&nop_mnt_idmap, inode, mask);
if (rv == 0)
return rv;
@@ -297,37 +347,44 @@ int proc_fd_permission(struct inode *inode, int mask)
return rv;
}
+static int proc_fd_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return proc_readfd_count(inode, &stat->size);
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
-static int
-proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
- inode->i_fop = &proc_fdinfo_file_operations;
+ inode->i_op = &proc_fdinfo_file_inode_operations;
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
+ inode->i_fop = &proc_fdinfo_file_operations;
+ tid_fd_update_inode(task, inode, 0);
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *
@@ -336,7 +393,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
@@ -344,11 +401,12 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
+ .permission = proc_fdinfo_permission,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfdinfo,
+ .iterate_shared = proc_fdinfo_iterate,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index 46dafadd0083..7e7265f7e06f 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PROCFS_FD_H__
#define __PROCFS_FD_H__
@@ -9,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations;
extern const struct file_operations proc_fdinfo_operations;
extern const struct inode_operations proc_fdinfo_inode_operations;
-extern int proc_fd_permission(struct inode *inode, int mask);
+extern int proc_fd_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
static inline unsigned int proc_fd(struct inode *inode)
{
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e3cda0b5968f..501889856461 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* proc/fs/generic.c --- generic routines for the proc-fs
*
@@ -8,12 +9,14 @@
* Copyright (C) 1997 Theodore Ts'o
*/
+#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/mount.h>
@@ -23,12 +26,24 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/uaccess.h>
+#include <linux/seq_file.h>
#include "internal.h"
static DEFINE_RWLOCK(proc_subdir_lock);
-static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
+struct kmem_cache *proc_dir_entry_cache __ro_after_init;
+
+void pde_free(struct proc_dir_entry *pde)
+{
+ if (S_ISLNK(pde->mode))
+ kfree(pde->data);
+ if (pde->name != pde->inline_name)
+ kfree(pde->name);
+ kmem_cache_free(proc_dir_entry_cache, pde);
+}
+
+static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
{
if (len < de->namelen)
return -1;
@@ -60,7 +75,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
struct proc_dir_entry *de = rb_entry(node,
struct proc_dir_entry,
subdir_node);
- int result = proc_match(len, name, de);
+ int result = proc_match(name, de, len);
if (result < 0)
node = node->rb_left;
@@ -83,7 +98,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
struct proc_dir_entry *this = rb_entry(*new,
struct proc_dir_entry,
subdir_node);
- int result = proc_match(de->namelen, de->name, this);
+ int result = proc_match(de->name, this, de->namelen);
parent = *new;
if (result < 0)
@@ -100,33 +115,38 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
return true;
}
-static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+static int proc_notify_change(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
struct proc_dir_entry *de = PDE(inode);
int error;
- error = setattr_prepare(dentry, iattr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
return error;
- setattr_copy(inode, iattr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, iattr);
proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
return 0;
}
-static int proc_getattr(const struct path *path, struct kstat *stat,
+static int proc_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct proc_dir_entry *de = PDE(inode);
- if (de && de->nlink)
- set_nlink(inode, de->nlink);
+ if (de) {
+ nlink_t nlink = READ_ONCE(de->nlink);
+ if (nlink > 0) {
+ set_nlink(inode, nlink);
+ }
+ }
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -144,24 +164,15 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
const char *cp = name, *next;
struct proc_dir_entry *de;
- unsigned int len;
- de = *ret;
- if (!de)
- de = &proc_root;
-
- while (1) {
- next = strchr(cp, '/');
- if (!next)
- break;
-
- len = next - cp;
- de = pde_subdir_find(de, cp, len);
+ de = *ret ?: &proc_root;
+ while ((next = strchr(cp, '/')) != NULL) {
+ de = pde_subdir_find(de, cp, next - cp);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
}
- cp += len + 1;
+ cp = next + 1;
}
*residual = cp;
*ret = de;
@@ -191,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum)
{
int i;
- i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
- GFP_KERNEL);
+ i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST,
+ GFP_KERNEL);
if (i < 0)
return i;
@@ -202,15 +213,36 @@ int proc_alloc_inum(unsigned int *inum)
void proc_free_inum(unsigned int inum)
{
- ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+ ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+}
+
+static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
+{
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0)
+ return 0; /* revalidate */
+ return 1;
+}
+
+static int proc_misc_d_delete(const struct dentry *dentry)
+{
+ return atomic_read(&PDE(d_inode(dentry))->in_use) < 0;
}
+static const struct dentry_operations proc_misc_dentry_ops = {
+ .d_revalidate = proc_misc_d_revalidate,
+ .d_delete = proc_misc_d_delete,
+};
+
/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
*/
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
- struct dentry *dentry)
+struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
+ struct proc_dir_entry *de)
{
struct inode *inode;
@@ -222,9 +254,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &simple_dentry_operations);
- d_add(dentry, inode);
- return NULL;
+ if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+ return d_splice_alias_ops(inode, dentry,
+ &proc_net_dentry_ops);
+ return d_splice_alias_ops(inode, dentry,
+ &proc_misc_dentry_ops);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -233,7 +267,12 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- return proc_lookup_de(PDE(dir), dir, dentry);
+ struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return ERR_PTR(-ENOENT);
+
+ return proc_lookup_de(dir, dentry, PDE(dir));
}
/*
@@ -245,17 +284,17 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
- struct dir_context *ctx)
+int proc_readdir_de(struct file *file, struct dir_context *ctx,
+ struct proc_dir_entry *de)
{
int i;
if (!dir_emit_dots(file, ctx))
return 0;
+ i = ctx->pos - 2;
read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
- i = ctx->pos - 2;
for (;;) {
if (!de) {
read_unlock(&proc_subdir_lock);
@@ -276,8 +315,8 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
pde_put(de);
return 0;
}
- read_lock(&proc_subdir_lock);
ctx->pos++;
+ read_lock(&proc_subdir_lock);
next = pde_subdir_next(de);
pde_put(de);
de = next;
@@ -289,8 +328,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
int proc_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return 1;
- return proc_readdir_de(PDE(inode), file, ctx);
+ return proc_readdir_de(file, ctx, PDE(inode));
}
/*
@@ -304,6 +347,17 @@ static const struct file_operations proc_dir_operations = {
.iterate_shared = proc_readdir,
};
+static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
/*
* proc directories can do almost nothing..
*/
@@ -313,13 +367,34 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
-static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+static void pde_set_flags(struct proc_dir_entry *pde)
{
- int ret;
+ const struct proc_ops *proc_ops = pde->proc_ops;
- ret = proc_alloc_inum(&dp->low_ino);
- if (ret)
- return ret;
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
+/* returns the registered entry, or frees dp and returns NULL on failure */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp)
+{
+ if (proc_alloc_inum(&dp->low_ino))
+ goto out_free_entry;
+
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
write_lock(&proc_subdir_lock);
dp->parent = dir;
@@ -327,12 +402,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
write_unlock(&proc_subdir_lock);
- proc_free_inum(dp->low_ino);
- return -EEXIST;
+ goto out_free_inum;
}
+ dir->nlink++;
write_unlock(&proc_subdir_lock);
- return 0;
+ return dp;
+out_free_inum:
+ proc_free_inum(dp->low_ino);
+out_free_entry:
+ pde_free(dp);
+ return NULL;
}
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
@@ -352,6 +432,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
WARN(1, "name len %u\n", qstr.len);
return NULL;
}
+ if (qstr.len == 1 && fn[0] == '.') {
+ WARN(1, "name '.'\n");
+ return NULL;
+ }
+ if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
+ WARN(1, "name '..'\n");
+ return NULL;
+ }
if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
WARN(1, "create '/proc/%s' by hand\n", qstr.name);
return NULL;
@@ -361,20 +449,34 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
return NULL;
}
- ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
+ ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!ent)
goto out;
+ if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) {
+ ent->name = ent->inline_name;
+ } else {
+ ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
+ if (!ent->name) {
+ pde_free(ent);
+ return NULL;
+ }
+ }
+
memcpy(ent->name, fn, qstr.len + 1);
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
ent->subdir = RB_ROOT;
- atomic_set(&ent->count, 1);
+ refcount_set(&ent->refcnt, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
+ pde_force_lookup(ent);
+
out:
return ent;
}
@@ -388,17 +490,13 @@ struct proc_dir_entry *proc_symlink(const char *name,
(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
if (ent) {
- ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ ent->size = strlen(dest);
+ ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL);
if (ent->data) {
- strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
- if (proc_register(parent, ent) < 0) {
- kfree(ent->data);
- kfree(ent);
- ent = NULL;
- }
+ ent = proc_register(parent, ent);
} else {
- kfree(ent);
+ pde_free(ent);
ent = NULL;
}
}
@@ -406,8 +504,8 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
- struct proc_dir_entry *parent, void *data)
+struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data, bool force_lookup)
{
struct proc_dir_entry *ent;
@@ -417,17 +515,22 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
ent->data = data;
- ent->proc_fops = &proc_dir_operations;
+ ent->proc_dir_ops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
- parent->nlink++;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- parent->nlink--;
- ent = NULL;
+ if (force_lookup) {
+ pde_force_lookup(ent);
}
+ ent = proc_register(parent, ent);
}
return ent;
}
+EXPORT_SYMBOL_GPL(_proc_mkdir);
+
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
+{
+ return _proc_mkdir(name, mode, parent, data, false);
+}
EXPORT_SYMBOL_GPL(proc_mkdir_data);
struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
@@ -452,53 +555,128 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent = __proc_create(&parent, name, mode, 2);
if (ent) {
ent->data = NULL;
- ent->proc_fops = NULL;
+ ent->proc_dir_ops = NULL;
ent->proc_iops = NULL;
- parent->nlink++;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- parent->nlink--;
- ent = NULL;
- }
+ ent = proc_register(parent, ent);
}
return ent;
}
EXPORT_SYMBOL(proc_create_mount_point);
-struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
- struct proc_dir_entry *parent,
- const struct file_operations *proc_fops,
- void *data)
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data)
{
- struct proc_dir_entry *pde;
+ struct proc_dir_entry *p;
+
if ((mode & S_IFMT) == 0)
mode |= S_IFREG;
-
- if (!S_ISREG(mode)) {
- WARN_ON(1); /* use proc_mkdir() */
+ if ((mode & S_IALLUGO) == 0)
+ mode |= S_IRUGO;
+ if (WARN_ON_ONCE(!S_ISREG(mode)))
return NULL;
+
+ p = __proc_create(parent, name, mode, 1);
+ if (p) {
+ p->proc_iops = &proc_file_inode_operations;
+ p->data = data;
}
+ return p;
+}
- BUG_ON(proc_fops == NULL);
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct proc_ops *proc_ops, void *data)
+{
+ struct proc_dir_entry *p;
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- pde = __proc_create(&parent, name, mode, 1);
- if (!pde)
- goto out;
- pde->proc_fops = proc_fops;
- pde->data = data;
- pde->proc_iops = &proc_file_inode_operations;
- if (proc_register(parent, pde) < 0)
- goto out_free;
- return pde;
-out_free:
- kfree(pde);
-out:
- return NULL;
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = proc_ops;
+ return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
+struct proc_dir_entry *proc_create(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct proc_ops *proc_ops)
+{
+ return proc_create_data(name, mode, parent, proc_ops, NULL);
+}
+EXPORT_SYMBOL(proc_create);
+
+static int proc_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ if (de->state_size)
+ return seq_open_private(file, de->seq_ops, de->state_size);
+ return seq_open(file, de->seq_ops);
+}
+
+static int proc_seq_release(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ if (de->state_size)
+ return seq_release_private(inode, file);
+ return seq_release(inode, file);
+}
+
+static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
+ .proc_open = proc_seq_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = proc_seq_release,
+};
+
+struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = &proc_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_seq_private);
+
+static int proc_single_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ return single_open(file, de->single_show, de->data);
+}
+
+static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
+ .proc_open = proc_single_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = &proc_single_ops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_single_data);
+
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
de->size = size;
@@ -512,19 +690,18 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
}
EXPORT_SYMBOL(proc_set_user);
-static void free_proc_entry(struct proc_dir_entry *de)
+void pde_put(struct proc_dir_entry *pde)
{
- proc_free_inum(de->low_ino);
-
- if (S_ISLNK(de->mode))
- kfree(de->data);
- kfree(de);
+ if (refcount_dec_and_test(&pde->refcnt)) {
+ proc_free_inum(pde->low_ino);
+ pde_free(pde);
+ }
}
-void pde_put(struct proc_dir_entry *pde)
+static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
{
- if (atomic_dec_and_test(&pde->count))
- free_proc_entry(pde);
+ rb_erase(&pde->subdir_node, &parent->subdir);
+ RB_CLEAR_NODE(&pde->subdir_node);
}
/*
@@ -544,8 +721,16 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
len = strlen(fn);
de = pde_subdir_find(parent, fn, len);
- if (de)
- rb_erase(&de->subdir_node, &parent->subdir);
+ if (de) {
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ pde_erase(de, parent);
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
+ }
+ }
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -554,9 +739,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
proc_entry_rundown(de);
- if (S_ISDIR(de->mode))
- parent->nlink--;
- de->nlink = 0;
WARN(pde_subdir_first(de),
"%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
__func__, de->parent->name, de->name, pde_subdir_first(de)->name);
@@ -582,23 +764,34 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
+ pde_erase(root, parent);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
- rb_erase(&next->subdir_node, &de->subdir);
+ if (unlikely(pde_is_permanent(next))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
+ pde_erase(next, de);
de = next;
continue;
}
- write_unlock(&proc_subdir_lock);
-
- proc_entry_rundown(de);
next = de->parent;
if (S_ISDIR(de->mode))
next->nlink--;
- de->nlink = 0;
+ write_unlock(&proc_subdir_lock);
+
+ proc_entry_rundown(de);
if (de == root)
break;
pde_put(de);
@@ -625,8 +818,26 @@ void proc_remove(struct proc_dir_entry *de)
}
EXPORT_SYMBOL(proc_remove);
-void *PDE_DATA(const struct inode *inode)
+/*
+ * Pull a user buffer into memory and pass it to the file's write handler if
+ * one is supplied. The ->write() method is permitted to modify the
+ * kernel-side buffer.
+ */
+ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
+ loff_t *_pos)
{
- return __PDE_DATA(inode);
+ struct proc_dir_entry *pde = PDE(file_inode(f));
+ char *buf;
+ int ret;
+
+ if (!pde->write)
+ return -EACCES;
+ if (size == 0 || size > PAGE_SIZE - 1)
+ return -EINVAL;
+ buf = memdup_user_nul(ubuf, size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ ret = pde->write(f, buf, size);
+ kfree(buf);
+ return ret == 0 ? size : ret;
}
-EXPORT_SYMBOL(PDE_DATA);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e250910cffc8..b7634f975d98 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/inode.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/cache.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
@@ -22,43 +24,37 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
-#include <linux/magic.h>
-
-#include <linux/uaccess.h>
+#include <linux/bug.h>
#include "internal.h"
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
+ struct proc_inode *ei = PROC_I(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
- put_pid(PROC_I(inode)->pid);
+ if (ei->pid)
+ proc_pid_evict_inode(ei);
- /* Let go of any associated proc directory entry */
- de = PDE(inode);
- if (de)
- pde_put(de);
-
- head = PROC_I(inode)->sysctl;
+ head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
-static struct kmem_cache * proc_inode_cachep;
+static struct kmem_cache *proc_inode_cachep __ro_after_init;
+static struct kmem_cache *pde_opener_cache __ro_after_init;
static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- struct inode *inode;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
@@ -67,20 +63,21 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
- inode = &ei->vfs_inode;
- return inode;
+ return &ei->vfs_inode;
}
-static void proc_i_callback(struct rcu_head *head)
+static void proc_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(proc_inode_cachep, PROC_I(inode));
-}
+ struct proc_inode *ei = PROC_I(inode);
-static void proc_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, proc_i_callback);
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
+ kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
static void init_once(void *foo)
@@ -90,36 +87,109 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-void __init proc_init_inodecache(void)
+void __init proc_init_kmemcache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
+ pde_opener_cache =
+ kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
+ SLAB_ACCOUNT|SLAB_PANIC, NULL);
+ proc_dir_entry_cache = kmem_cache_create_usercopy(
+ "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC,
+ offsetof(struct proc_dir_entry, inline_name),
+ SIZEOF_PDE_INLINE_NAME, NULL);
+ BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
+}
+
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+ struct hlist_node *node;
+ struct super_block *old_sb = NULL;
+
+ rcu_read_lock();
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct super_block *sb;
+ struct inode *inode;
+
+ spin_lock(lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(lock);
+
+ inode = &ei->vfs_inode;
+ sb = inode->i_sb;
+ if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
+ continue;
+ inode = igrab(inode);
+ rcu_read_unlock();
+ if (sb != old_sb) {
+ if (old_sb)
+ deactivate_super(old_sb);
+ old_sb = sb;
+ }
+ if (unlikely(!inode)) {
+ rcu_read_lock();
+ continue;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *dir = d_find_any_alias(inode);
+ if (dir) {
+ d_invalidate(dir);
+ dput(dir);
+ }
+ } else {
+ struct dentry *dentry;
+ while ((dentry = d_find_alias(inode))) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+ }
+ iput(inode);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ if (old_sb)
+ deactivate_super(old_sb);
+}
+
+static inline const char *hidepid2str(enum proc_hidepid v)
+{
+ switch (v) {
+ case HIDEPID_OFF: return "off";
+ case HIDEPID_NO_ACCESS: return "noaccess";
+ case HIDEPID_INVISIBLE: return "invisible";
+ case HIDEPID_NOT_PTRACEABLE: return "ptraceable";
+ }
+ WARN_ONCE(1, "bad hide_pid value: %d\n", v);
+ return "unknown";
}
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = root->d_sb;
- struct pid_namespace *pid = sb->s_fs_info;
+ struct proc_fs_info *fs_info = proc_sb_info(root->d_sb);
- if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
- seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
- if (pid->hide_pid != HIDEPID_OFF)
- seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+ if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid));
+ if (fs_info->hide_pid != HIDEPID_OFF)
+ seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid));
+ if (fs_info->pidonly != PROC_PIDONLY_OFF)
+ seq_printf(seq, ",subset=pid");
return 0;
}
-static const struct super_operations proc_sops = {
+const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
- .destroy_inode = proc_destroy_inode,
- .drop_inode = generic_delete_inode,
+ .free_inode = proc_free_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
- .remount_fs = proc_remount,
.show_options = proc_show_options,
};
@@ -127,17 +197,26 @@ enum {BIAS = -1U<<31};
static inline int use_pde(struct proc_dir_entry *pde)
{
- return atomic_inc_unless_negative(&pde->in_use);
+ return likely(atomic_inc_unless_negative(&pde->in_use));
}
static void unuse_pde(struct proc_dir_entry *pde)
{
- if (atomic_dec_return(&pde->in_use) == BIAS)
+ if (unlikely(atomic_dec_return(&pde->in_use) == BIAS))
complete(pde->pde_unload_completion);
}
-/* pde is locked */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+ __releases(&pde->pde_unload_lock)
{
/*
* close() (proc_reg_release()) can't delete an entry and proceed:
@@ -145,9 +224,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
- *
- * Therefore, first process to enter this function does ->release() and
- * signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
@@ -155,19 +231,24 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->c = &c;
spin_unlock(&pde->pde_unload_lock);
wait_for_completion(&c);
- spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
+ struct completion *c;
+
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
+
file = pdeo->file;
- pde->proc_fops->release(file_inode(file), file);
+ pde->proc_ops->proc_release(file_inode(file), file);
+
spin_lock(&pde->pde_unload_lock);
- /* After ->release. */
+ /* Strictly after ->proc_release, see above. */
list_del(&pdeo->lh);
- if (pdeo->c)
- complete(pdeo->c);
- kfree(pdeo);
+ c = pdeo->c;
+ spin_unlock(&pde->pde_unload_lock);
+ if (unlikely(c))
+ complete(c);
+ kmem_cache_free(pde_opener_cache, pdeo);
}
}
@@ -186,6 +267,7 @@ void proc_entry_rundown(struct proc_dir_entry *de)
struct pde_opener *pdeo;
pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
close_pdeo(de, pdeo);
+ spin_lock(&de->pde_unload_lock);
}
spin_unlock(&de->pde_unload_lock);
}
@@ -194,124 +276,191 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- if (use_pde(pde)) {
- loff_t (*llseek)(struct file *, loff_t, int);
- llseek = pde->proc_fops->llseek;
- if (!llseek)
- llseek = default_llseek;
- rv = llseek(file, offset, whence);
+
+ if (pde_is_permanent(pde)) {
+ return pde->proc_ops->proc_lseek(file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde->proc_ops->proc_lseek(file, offset, whence);
unuse_pde(pde);
}
return rv;
}
+static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ if (pde_is_permanent(pde))
+ return pde->proc_ops->proc_read_iter(iocb, iter);
+
+ if (!use_pde(pde))
+ return -EIO;
+ ret = pde->proc_ops->proc_read_iter(iocb, iter);
+ unuse_pde(pde);
+ return ret;
+}
+
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ const auto read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- read = pde->proc_fops->read;
- if (read)
- rv = read(file, buf, count, ppos);
+
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ const auto write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- write = pde->proc_fops->write;
- if (write)
- rv = write(file, buf, count, ppos);
+
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
-static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ const auto poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
+static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
- unsigned int rv = DEFAULT_POLLMASK;
- unsigned int (*poll)(struct file *, struct poll_table_struct *);
- if (use_pde(pde)) {
- poll = pde->proc_fops->poll;
- if (poll)
- rv = poll(file, pts);
+ __poll_t rv = DEFAULT_POLLMASK;
+
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
unuse_pde(pde);
}
return rv;
}
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ const auto ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- long (*ioctl)(struct file *, unsigned int, unsigned long);
- if (use_pde(pde)) {
- ioctl = pde->proc_fops->unlocked_ioctl;
- if (ioctl)
- rv = ioctl(file, cmd, arg);
+
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
- if (use_pde(pde)) {
- compat_ioctl = pde->proc_fops->compat_ioctl;
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#endif
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ const auto mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
- int (*mmap)(struct file *, struct vm_area_struct *);
- if (use_pde(pde)) {
- mmap = pde->proc_fops->mmap;
- if (mmap)
- rv = mmap(file, vma);
+
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
unuse_pde(pde);
}
return rv;
}
static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- struct proc_dir_entry *pde = PDE(file_inode(file));
- unsigned long rv = -EIO;
-
- if (use_pde(pde)) {
- typeof(proc_reg_get_unmapped_area) *get_area;
+ if (pde->proc_ops->proc_get_unmapped_area)
+ return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
- get_area = pde->proc_fops->get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags);
#endif
- if (get_area)
- rv = get_area(file, orig_addr, len, pgoff, flags);
- else
- rv = orig_addr;
+ return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
unuse_pde(pde);
}
return rv;
@@ -321,10 +470,19 @@ static int proc_reg_open(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
- int (*open)(struct inode *, struct file *);
- int (*release)(struct inode *, struct file *);
+ typeof_member(struct proc_ops, proc_open) open;
struct pde_opener *pdeo;
+ if (!pde_has_proc_lseek(pde))
+ file->f_mode &= ~FMODE_LSEEK;
+
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -336,31 +494,36 @@ static int proc_reg_open(struct inode *inode, struct file *file)
*
* Save every "struct file" with custom ->release hook.
*/
- pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
- if (!pdeo)
- return -ENOMEM;
-
- if (!use_pde(pde)) {
- kfree(pdeo);
+ if (!use_pde(pde))
return -ENOENT;
+
+ const auto release = pde->proc_ops->proc_release;
+ if (release) {
+ pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
+ if (!pdeo) {
+ rv = -ENOMEM;
+ goto out_unuse;
+ }
}
- open = pde->proc_fops->open;
- release = pde->proc_fops->release;
+ open = pde->proc_ops->proc_open;
if (open)
rv = open(inode, file);
- if (rv == 0 && release) {
- /* To know what to release. */
- pdeo->file = file;
- pdeo->closing = false;
- pdeo->c = NULL;
- spin_lock(&pde->pde_unload_lock);
- list_add(&pdeo->lh, &pde->pde_openers);
- spin_unlock(&pde->pde_unload_lock);
- } else
- kfree(pdeo);
+ if (release) {
+ if (rv == 0) {
+ /* To know what to release. */
+ pdeo->file = file;
+ pdeo->closing = false;
+ pdeo->c = NULL;
+ spin_lock(&pde->pde_unload_lock);
+ list_add(&pdeo->lh, &pde->pde_openers);
+ spin_unlock(&pde->pde_unload_lock);
+ } else
+ kmem_cache_free(pde_opener_cache, pdeo);
+ }
+out_unuse:
unuse_pde(pde);
return rv;
}
@@ -369,11 +532,19 @@ static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ const auto release = pde->proc_ops->proc_release;
+ if (release)
+ return release(inode, file);
+ return 0;
+ }
+
spin_lock(&pde->pde_unload_lock);
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
close_pdeo(pde, pdeo);
- break;
+ return 0;
}
}
spin_unlock(&pde->pde_unload_lock);
@@ -386,9 +557,19 @@ static const struct file_operations proc_reg_file_ops = {
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = proc_reg_compat_ioctl,
-#endif
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .splice_read = copy_splice_read,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -396,12 +577,27 @@ static const struct file_operations proc_reg_file_ops = {
};
#ifdef CONFIG_COMPAT
-static const struct file_operations proc_reg_file_ops_no_compat = {
+static const struct file_operations proc_reg_file_ops_compat = {
.llseek = proc_reg_llseek,
.read = proc_reg_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .splice_read = copy_splice_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -419,7 +615,7 @@ static const char *proc_get_link(struct dentry *dentry,
struct delayed_call *done)
{
struct proc_dir_entry *pde = PDE(inode);
- if (unlikely(!use_pde(pde)))
+ if (!use_pde(pde))
return ERR_PTR(-EINVAL);
set_delayed_call(done, proc_put_link, pde);
return pde->data;
@@ -431,87 +627,54 @@ const struct inode_operations proc_link_inode_operations = {
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode *inode = new_inode_pseudo(sb);
-
- if (inode) {
- inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- PROC_I(inode)->pde = de;
-
- if (is_empty_pde(de)) {
- make_empty_dir_inode(inode);
- return inode;
- }
- if (de->mode) {
- inode->i_mode = de->mode;
- inode->i_uid = de->uid;
- inode->i_gid = de->gid;
- }
- if (de->size)
- inode->i_size = de->size;
- if (de->nlink)
- set_nlink(inode, de->nlink);
- WARN_ON(!de->proc_iops);
- inode->i_op = de->proc_iops;
- if (de->proc_fops) {
- if (S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_COMPAT
- if (!de->proc_fops->compat_ioctl)
- inode->i_fop =
- &proc_reg_file_ops_no_compat;
- else
-#endif
- inode->i_fop = &proc_reg_file_ops;
- } else {
- inode->i_fop = de->proc_fops;
- }
- }
- } else
- pde_put(de);
- return inode;
-}
-
-int proc_fill_super(struct super_block *s, void *data, int silent)
-{
- struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
- struct inode *root_inode;
- int ret;
+ struct inode *inode = new_inode(sb);
- if (!proc_parse_options(data, ns))
- return -EINVAL;
-
- /* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
- s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = PROC_SUPER_MAGIC;
- s->s_op = &proc_sops;
- s->s_time_gran = 1;
+ if (!inode) {
+ pde_put(de);
+ return NULL;
+ }
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode) {
- pr_err("proc_fill_super: get root inode failed\n");
- return -ENOMEM;
+ inode->i_private = de->data;
+ inode->i_ino = de->low_ino;
+ simple_inode_init_ts(inode);
+ PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
}
- s->s_root = d_make_root(root_inode);
- if (!s->s_root) {
- pr_err("proc_fill_super: allocate dentry failed\n");
- return -ENOMEM;
+ if (de->mode) {
+ inode->i_mode = de->mode;
+ inode->i_uid = de->uid;
+ inode->i_gid = de->gid;
}
+ if (de->size)
+ inode->i_size = de->size;
+ if (de->nlink)
+ set_nlink(inode, de->nlink);
- ret = proc_setup_self(s);
- if (ret) {
- return ret;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ if (pde_has_proc_read_iter(de))
+ inode->i_fop = &proc_iter_file_ops;
+ else
+ inode->i_fop = &proc_reg_file_ops;
+#ifdef CONFIG_COMPAT
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
+ inode->i_fop = &proc_iter_file_ops_compat;
+ else
+ inode->i_fop = &proc_reg_file_ops_compat;
+ }
+#endif
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else {
+ BUG();
}
- return proc_setup_thread_self(s);
+ return inode;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..c1e8eb984da8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -1,21 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
+#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
struct ctl_table_header;
struct mempolicy;
@@ -31,33 +29,89 @@ struct mempolicy;
* subdir_node is used to build the rb tree "subdir" of the parent.
*/
struct proc_dir_entry {
+ /*
+ * number of callers into module in progress;
+ * negative -> it's going away RSN
+ */
+ atomic_t in_use;
+ refcount_t refcnt;
+ struct list_head pde_openers; /* who did ->open, but not ->release */
+ /* protects ->pde_openers and all struct pde_opener instances */
+ spinlock_t pde_unload_lock;
+ struct completion *pde_unload_completion;
+ const struct inode_operations *proc_iops;
+ union {
+ const struct proc_ops *proc_ops;
+ const struct file_operations *proc_dir_ops;
+ };
+ union {
+ const struct seq_operations *seq_ops;
+ int (*single_show)(struct seq_file *, void *);
+ };
+ proc_write_t write;
+ void *data;
+ unsigned int state_size;
unsigned int low_ino;
- umode_t mode;
nlink_t nlink;
kuid_t uid;
kgid_t gid;
loff_t size;
- const struct inode_operations *proc_iops;
- const struct file_operations *proc_fops;
struct proc_dir_entry *parent;
struct rb_root subdir;
struct rb_node subdir_node;
- void *data;
- atomic_t count; /* use count */
- atomic_t in_use; /* number of callers into module in progress; */
- /* negative -> it's going away RSN */
- struct completion *pde_unload_completion;
- struct list_head pde_openers; /* who did ->open, but not ->release */
- spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+ char *name;
+ umode_t mode;
+ u8 flags;
u8 namelen;
- char name[];
+ char inline_name[];
} __randomize_layout;
+#define SIZEOF_PDE ( \
+ sizeof(struct proc_dir_entry) < 128 ? 128 : \
+ sizeof(struct proc_dir_entry) < 192 ? 192 : \
+ sizeof(struct proc_dir_entry) < 256 ? 256 : \
+ sizeof(struct proc_dir_entry) < 512 ? 512 : \
+ 0)
+#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
+extern struct kmem_cache *proc_dir_entry_cache;
+void pde_free(struct proc_dir_entry *pde);
+
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
+ int lsmid;
};
struct proc_inode {
@@ -66,8 +120,8 @@ struct proc_inode {
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
- struct ctl_table *sysctl_entry;
- struct hlist_node sysctl_inodes;
+ const struct ctl_table *sysctl_entry;
+ struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
@@ -85,46 +139,20 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
-static inline void *__PDE_DATA(const struct inode *inode)
-{
- return PDE(inode)->data;
-}
-
-static inline struct pid *proc_pid(struct inode *inode)
+static inline struct pid *proc_pid(const struct inode *inode)
{
return PROC_I(inode)->pid;
}
-static inline struct task_struct *get_proc_task(struct inode *inode)
+static inline struct task_struct *get_proc_task(const struct inode *inode)
{
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
-void task_dump_owner(struct task_struct *task, mode_t mode,
+void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t *ruid, kgid_t *rgid);
-static inline unsigned name_to_int(const struct qstr *qstr)
-{
- const char *name = qstr->name;
- int len = qstr->len;
- unsigned n = 0;
-
- if (len > 1 && *name == '0')
- goto out;
- while (len-- > 0) {
- unsigned c = *name++ - '0';
- if (c > 9)
- goto out;
- if (n >= (~0U-9)/10)
- goto out;
- n *= 10;
- n += c;
- }
- return n;
-out:
- return ~0U;
-}
-
+unsigned name_to_int(const struct qstr *qstr);
/*
* Offset of the first process in the /proc root directory..
*/
@@ -133,11 +161,87 @@ out:
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+#ifdef CONFIG_PAGE_MAPCOUNT
+/**
+ * folio_precise_page_mapcount() - Number of mappings of this folio page.
+ * @folio: The folio.
+ * @page: The page.
+ *
+ * The number of present user page table entries that reference this page
+ * as tracked via the RMAP: either referenced directly (PTE) or as part of
+ * a larger area that covers this page (e.g., PMD).
+ *
+ * Use this function only for the calculation of existing statistics
+ * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
+ *
+ * Do not add new users.
+ *
+ * Returns: The number of mappings of this folio page. 0 for
+ * folios that are not mapped to user space or are not tracked via the RMAP
+ * (e.g., shared zeropage).
+ */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+
+ return mapcount;
+}
+#else /* !CONFIG_PAGE_MAPCOUNT */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_PAGE_MAPCOUNT */
+
+/**
+ * folio_average_page_mapcount() - Average number of mappings per page in this
+ * folio
+ * @folio: The folio.
+ *
+ * The average number of user page table entries that reference each page in
+ * this folio as tracked via the RMAP: either referenced directly (PTE) or
+ * as part of a larger area that covers this page (e.g., PMD).
+ *
+ * The average is calculated by rounding to the nearest integer; however,
+ * to avoid duplicated code in current callers, the average is at least
+ * 1 if any page of the folio is mapped.
+ *
+ * Returns: The average number of mappings per page in this folio.
+ */
+static inline int folio_average_page_mapcount(struct folio *folio)
+{
+ int mapcount, entire_mapcount, avg;
+
+ if (!folio_test_large(folio))
+ return atomic_read(&folio->_mapcount) + 1;
+
+ mapcount = folio_large_mapcount(folio);
+ if (unlikely(mapcount <= 0))
+ return 0;
+ entire_mapcount = folio_entire_mapcount(folio);
+ if (mapcount <= entire_mapcount)
+ return entire_mapcount;
+ mapcount -= entire_mapcount;
+
+ /* Round to closest integer ... */
+ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio);
+ /* ... but return at least 1. */
+ return max_t(int, avg + entire_mapcount, 1);
+}
/*
* array.c
*/
extern const struct file_operations proc_tid_children_operations;
+extern void proc_task_name(struct seq_file *m, struct task_struct *p,
+ bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
@@ -151,34 +255,39 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
* base.c
*/
extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int proc_setattr(struct dentry *, struct iattr *);
+extern int pid_getattr(struct mnt_idmap *, const struct path *,
+ struct kstat *, u32, unsigned int);
+extern int proc_setattr(struct mnt_idmap *, struct dentry *,
+ struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
-extern int pid_revalidate(struct dentry *, unsigned int);
+extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
-typedef int instantiate_t(struct inode *, struct dentry *,
+typedef struct dentry *instantiate_t(struct dentry *,
struct task_struct *, const void *);
-extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
instantiate_t, struct task_struct *, const void *);
/*
* generic.c
*/
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data);
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
-extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
- struct dentry *);
+struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
-extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
+int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
-static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
+static inline void pde_get(struct proc_dir_entry *pde)
{
- atomic_inc(&pde->count);
- return pde;
+ refcount_inc(&pde->refcnt);
}
extern void pde_put(struct proc_dir_entry *);
@@ -186,24 +295,25 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
return S_ISDIR(pde->mode) && !pde->proc_iops;
}
+extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);
/*
* inode.c
*/
struct pde_opener {
- struct file *file;
struct list_head lh;
+ struct file *file;
bool closing;
struct completion *c;
-};
+} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
-
extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
-extern void proc_init_inodecache(void);
+void proc_init_kmemcache(void);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -261,21 +371,29 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
-extern int proc_remount(struct super_block *, int *, char *);
+extern unsigned self_inum, thread_self_inum;
/*
* task_[no]mmu.c
*/
+struct mem_size_stats;
+
+struct proc_maps_locking_ctx {
+ struct mm_struct *mm;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
+};
+
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
- struct mm_struct *mm;
-#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
-#endif
+ struct vma_iterator iter;
+ loff_t last_pos;
+ struct proc_maps_locking_ctx lock_ctx;
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
@@ -284,11 +402,9 @@ struct proc_maps_private {
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
extern const struct file_operations proc_pid_maps_operations;
-extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
-extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
-extern const struct file_operations proc_tid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
@@ -297,3 +413,22 @@ extern unsigned long task_statm(struct mm_struct *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
+
+extern const struct dentry_operations proc_net_dentry_ops;
+static inline void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
+}
+
+/*
+ * Add a new procfs dentry that can't serve as a mountpoint. That should
+ * encompass anything that is ephemeral and can just disappear while the
+ * process is still around.
+ */
+static inline struct dentry *proc_splice_unmountable(struct inode *inode,
+ struct dentry *dentry, const struct dentry_operations *d_ops)
+{
+ dont_mount(dentry);
+ return d_splice_alias_ops(inode, dentry, d_ops);
+}
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index a352d5703b41..714a22ded8a8 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
@@ -10,13 +11,13 @@
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= nr_irqs) ? pos : NULL;
+ return *pos <= irq_get_nr_irqs() ? pos : NULL;
}
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > nr_irqs)
+ if (*pos > irq_get_nr_irqs())
return NULL;
return pos;
}
@@ -33,21 +34,9 @@ static const struct seq_operations int_seq_ops = {
.show = show_interrupts
};
-static int interrupts_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &int_seq_ops);
-}
-
-static const struct file_operations proc_interrupts_operations = {
- .open = interrupts_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_interrupts_init(void)
{
- proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+ proc_create_seq("interrupts", 0, NULL, &int_seq_ops);
return 0;
}
fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 45629f4b5402..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/proc/kcore.c kernel ELF core dumper
*
@@ -9,6 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
+#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
@@ -16,24 +18,22 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
#include <linux/memory.h>
#include <linux/sched/task.h>
+#include <linux/security.h>
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -48,104 +48,100 @@ static struct proc_dir_entry *proc_root_kcore;
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
-/* An ELF note in memory */
-struct memelfnote
+#ifndef kc_xlate_dev_mem_ptr
+#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr
+static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys)
{
- const char *name;
- int type;
- unsigned int datasz;
- void *data;
-};
+ return __va(phys);
+}
+#endif
+#ifndef kc_unxlate_dev_mem_ptr
+#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr
+static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
+{
+}
+#endif
static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
+static int kcore_nphdr;
+static size_t kcore_phdrs_len;
+static size_t kcore_notes_len;
+static size_t kcore_data_offset;
+DEFINE_STATIC_PERCPU_RWSEM(kclist_lock);
static int kcore_need_update = 1;
-void
-kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+ if (mem_pfn_is_ram)
+ return -EBUSY;
+ mem_pfn_is_ram = fn;
+ return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+ if (mem_pfn_is_ram)
+ return mem_pfn_is_ram(pfn);
+ else
+ return 1;
+}
+
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+ int type)
{
new->addr = (unsigned long)addr;
new->size = size;
new->type = type;
- write_lock(&kclist_lock);
list_add_tail(&new->list, &kclist_head);
- write_unlock(&kclist_lock);
}
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static void update_kcore_size(void)
{
size_t try, size;
struct kcore_list *m;
- *nphdr = 1; /* PT_NOTE */
+ kcore_nphdr = 1; /* PT_NOTE */
size = 0;
list_for_each_entry(m, &kclist_head, list) {
try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > size)
size = try;
- *nphdr = *nphdr + 1;
- }
- *elf_buflen = sizeof(struct elfhdr) +
- (*nphdr + 2)*sizeof(struct elf_phdr) +
- 3 * ((sizeof(struct elf_note)) +
- roundup(sizeof(CORE_STR), 4)) +
- roundup(sizeof(struct elf_prstatus), 4) +
- roundup(sizeof(struct elf_prpsinfo), 4) +
- roundup(arch_task_struct_size, 4);
- *elf_buflen = PAGE_ALIGN(*elf_buflen);
- return size + *elf_buflen;
-}
-
-static void free_kclist_ents(struct list_head *head)
-{
- struct kcore_list *tmp, *pos;
-
- list_for_each_entry_safe(pos, tmp, head, list) {
- list_del(&pos->list);
- kfree(pos);
+ kcore_nphdr++;
}
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
- int nphdr;
- size_t size;
- struct kcore_list *tmp, *pos;
- LIST_HEAD(garbage);
- write_lock(&kclist_lock);
- if (kcore_need_update) {
- list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
- if (pos->type == KCORE_RAM
- || pos->type == KCORE_VMEMMAP)
- list_move(&pos->list, &garbage);
- }
- list_splice_tail(list, &kclist_head);
- } else
- list_splice(list, &garbage);
- kcore_need_update = 0;
- proc_root_kcore->size = get_kcore_size(&nphdr, &size);
- write_unlock(&kclist_lock);
-
- free_kclist_ents(&garbage);
+ kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
+ kcore_notes_len = (4 * sizeof(struct elf_note) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
+ kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len +
+ kcore_notes_len);
+ proc_root_kcore->size = kcore_data_offset + size;
}
-
#ifdef CONFIG_HIGHMEM
/*
* If no highmem, we can assume [0...max_low_pfn) continuous range of memory
* because memory hole is not as big as !HIGHMEM case.
* (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
*/
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
{
- LIST_HEAD(head);
struct kcore_list *ent;
- int ret = 0;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -153,9 +149,8 @@ static int kcore_update_ram(void)
ent->addr = (unsigned long)__va(0);
ent->size = max_low_pfn << PAGE_SHIFT;
ent->type = KCORE_RAM;
- list_add(&ent->list, &head);
- __kcore_update_ram(&head);
- return ret;
+ list_add(&ent->list, head);
+ return 0;
}
#else /* !CONFIG_HIGHMEM */
@@ -208,25 +203,32 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
{
struct list_head *head = (struct list_head *)arg;
struct kcore_list *ent;
+ struct page *p;
+
+ if (!pfn_valid(pfn))
+ return 1;
+
+ p = pfn_to_page(pfn);
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
- ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
+ ent->addr = (unsigned long)page_to_virt(p);
ent->size = nr_pages << PAGE_SHIFT;
- /* Sanity check: Can happen in 32bit arch...maybe */
- if (ent->addr < (unsigned long) __va(0))
+ if (!virt_addr_valid((void *)ent->addr))
goto free_out;
/* cut not-mapped area. ....from ppc-32 code. */
if (ULONG_MAX - ent->addr < ent->size)
ent->size = ULONG_MAX - ent->addr;
- /* cut when vmalloc() area is higher than direct-map area */
- if (VMALLOC_START > (unsigned long)__va(0)) {
- if (ent->addr > VMALLOC_START)
- goto free_out;
+ /*
+ * We've already checked virt_addr_valid so we know this address
+ * is a valid pointer, therefore we can check against it to determine
+ * if we need to trim
+ */
+ if (VMALLOC_START > ent->addr) {
if (VMALLOC_START - ent->addr < ent->size)
ent->size = VMALLOC_START - ent->addr;
}
@@ -245,13 +247,12 @@ free_out:
return 1;
}
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
{
int nid, ret;
unsigned long end_pfn;
- LIST_HEAD(head);
- /* Not inialized....update now */
+ /* Not initialized....update now */
/* find out "max pfn" */
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
@@ -261,297 +262,386 @@ static int kcore_update_ram(void)
end_pfn = node_end;
}
/* scan 0 to max_pfn */
- ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
- if (ret) {
- free_kclist_ents(&head);
+ ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+ if (ret)
return -ENOMEM;
- }
- __kcore_update_ram(&head);
- return ret;
+ return 0;
}
#endif /* CONFIG_HIGHMEM */
-/*****************************************************************************/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
-{
- int sz;
-
- sz = sizeof(struct elf_note);
- sz += roundup((strlen(en->name) + 1), 4);
- sz += roundup(en->datasz, 4);
-
- return sz;
-} /* end notesize() */
-
-/*****************************************************************************/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
+static int kcore_update_ram(void)
{
- struct elf_note en;
+ LIST_HEAD(list);
+ LIST_HEAD(garbage);
+ struct kcore_list *tmp, *pos;
+ int ret = 0;
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
+ percpu_down_write(&kclist_lock);
+ if (!xchg(&kcore_need_update, 0))
+ goto out;
- en.n_namesz = strlen(men->name) + 1;
- en.n_descsz = men->datasz;
- en.n_type = men->type;
+ ret = kcore_ram_list(&list);
+ if (ret) {
+ /* Couldn't get the RAM list, try again next time. */
+ WRITE_ONCE(kcore_need_update, 1);
+ list_splice_tail(&list, &garbage);
+ goto out;
+ }
- DUMP_WRITE(&en, sizeof(en));
- DUMP_WRITE(men->name, en.n_namesz);
+ list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+ if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+ list_move(&pos->list, &garbage);
+ }
+ list_splice_tail(&list, &kclist_head);
- /* XXX - cast from long long to long to avoid need for libgcc.a */
- bufp = (char*) roundup((unsigned long)bufp,4);
- DUMP_WRITE(men->data, men->datasz);
- bufp = (char*) roundup((unsigned long)bufp,4);
+ update_kcore_size();
-#undef DUMP_WRITE
+out:
+ percpu_up_write(&kclist_lock);
+ list_for_each_entry_safe(pos, tmp, &garbage, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+ return ret;
+}
- return bufp;
-} /* end storenote() */
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
+{
+ struct elf_note *note = (struct elf_note *)&notes[*i];
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = descsz;
+ note->n_type = type;
+ *i += sizeof(*note);
+ memcpy(&notes[*i], name, note->n_namesz);
+ *i = ALIGN(*i + note->n_namesz, 4);
+ memcpy(&notes[*i], desc, descsz);
+ *i = ALIGN(*i + descsz, 4);
+}
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct elf_prstatus prstatus; /* NT_PRSTATUS */
- struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */
- struct elf_phdr *nhdr, *phdr;
- struct elfhdr *elf;
- struct memelfnote notes[3];
- off_t offset = 0;
+ struct file *file = iocb->ki_filp;
+ char *buf = file->private_data;
+ loff_t *fpos = &iocb->ki_pos;
+ size_t phdrs_offset, notes_offset;
+ size_t page_offline_frozen = 1;
struct kcore_list *m;
+ size_t tsz;
+ unsigned long start;
+ size_t buflen = iov_iter_count(iter);
+ size_t orig_buflen = buflen;
+ int ret = 0;
- /* setup ELF header */
- elf = (struct elfhdr *) bufp;
- bufp += sizeof(struct elfhdr);
- offset += sizeof(struct elfhdr);
- memcpy(elf->e_ident, ELFMAG, SELFMAG);
- elf->e_ident[EI_CLASS] = ELF_CLASS;
- elf->e_ident[EI_DATA] = ELF_DATA;
- elf->e_ident[EI_VERSION]= EV_CURRENT;
- elf->e_ident[EI_OSABI] = ELF_OSABI;
- memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
- elf->e_type = ET_CORE;
- elf->e_machine = ELF_ARCH;
- elf->e_version = EV_CURRENT;
- elf->e_entry = 0;
- elf->e_phoff = sizeof(struct elfhdr);
- elf->e_shoff = 0;
- elf->e_flags = ELF_CORE_EFLAGS;
- elf->e_ehsize = sizeof(struct elfhdr);
- elf->e_phentsize= sizeof(struct elf_phdr);
- elf->e_phnum = nphdr;
- elf->e_shentsize= 0;
- elf->e_shnum = 0;
- elf->e_shstrndx = 0;
-
- /* setup ELF PT_NOTE program header */
- nhdr = (struct elf_phdr *) bufp;
- bufp += sizeof(struct elf_phdr);
- offset += sizeof(struct elf_phdr);
- nhdr->p_type = PT_NOTE;
- nhdr->p_offset = 0;
- nhdr->p_vaddr = 0;
- nhdr->p_paddr = 0;
- nhdr->p_filesz = 0;
- nhdr->p_memsz = 0;
- nhdr->p_flags = 0;
- nhdr->p_align = 0;
-
- /* setup ELF PT_LOAD program header for every area */
- list_for_each_entry(m, &kclist_head, list) {
- phdr = (struct elf_phdr *) bufp;
- bufp += sizeof(struct elf_phdr);
- offset += sizeof(struct elf_phdr);
-
- phdr->p_type = PT_LOAD;
- phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff;
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_TEXT)
- phdr->p_paddr = __pa(m->addr);
- else
- phdr->p_paddr = (elf_addr_t)-1;
- phdr->p_filesz = phdr->p_memsz = m->size;
- phdr->p_align = PAGE_SIZE;
- }
-
+ percpu_down_read(&kclist_lock);
/*
- * Set up the notes in similar form to SVR4 core dumps made
- * with info from their /proc.
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
*/
- nhdr->p_offset = offset;
-
- /* set up the process status */
- notes[0].name = CORE_STR;
- notes[0].type = NT_PRSTATUS;
- notes[0].datasz = sizeof(struct elf_prstatus);
- notes[0].data = &prstatus;
-
- memset(&prstatus, 0, sizeof(struct elf_prstatus));
-
- nhdr->p_filesz = notesize(&notes[0]);
- bufp = storenote(&notes[0], bufp);
-
- /* set up the process info */
- notes[1].name = CORE_STR;
- notes[1].type = NT_PRPSINFO;
- notes[1].datasz = sizeof(struct elf_prpsinfo);
- notes[1].data = &prpsinfo;
-
- memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
- prpsinfo.pr_state = 0;
- prpsinfo.pr_sname = 'R';
- prpsinfo.pr_zomb = 0;
-
- strcpy(prpsinfo.pr_fname, "vmlinux");
- strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
+ page_offline_freeze();
+
+ phdrs_offset = sizeof(struct elfhdr);
+ notes_offset = phdrs_offset + kcore_phdrs_len;
+
+ /* ELF file header. */
+ if (buflen && *fpos < sizeof(struct elfhdr)) {
+ struct elfhdr ehdr = {
+ .e_ident = {
+ [EI_MAG0] = ELFMAG0,
+ [EI_MAG1] = ELFMAG1,
+ [EI_MAG2] = ELFMAG2,
+ [EI_MAG3] = ELFMAG3,
+ [EI_CLASS] = ELF_CLASS,
+ [EI_DATA] = ELF_DATA,
+ [EI_VERSION] = EV_CURRENT,
+ [EI_OSABI] = ELF_OSABI,
+ },
+ .e_type = ET_CORE,
+ .e_machine = ELF_ARCH,
+ .e_version = EV_CURRENT,
+ .e_phoff = sizeof(struct elfhdr),
+ .e_flags = ELF_CORE_EFLAGS,
+ .e_ehsize = sizeof(struct elfhdr),
+ .e_phentsize = sizeof(struct elf_phdr),
+ .e_phnum = kcore_nphdr,
+ };
+
+ tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
- nhdr->p_filesz += notesize(&notes[1]);
- bufp = storenote(&notes[1], bufp);
+ buflen -= tsz;
+ *fpos += tsz;
+ }
- /* set up the task structure */
- notes[2].name = CORE_STR;
- notes[2].type = NT_TASKSTRUCT;
- notes[2].datasz = arch_task_struct_size;
- notes[2].data = current;
+ /* ELF program headers. */
+ if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) {
+ struct elf_phdr *phdrs, *phdr;
- nhdr->p_filesz += notesize(&notes[2]);
- bufp = storenote(&notes[2], bufp);
+ phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL);
+ if (!phdrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
-} /* end elf_kcore_store_hdr() */
+ phdrs[0].p_type = PT_NOTE;
+ phdrs[0].p_offset = notes_offset;
+ phdrs[0].p_filesz = kcore_notes_len;
-/*****************************************************************************/
-/*
- * read from the ELF header and then kernel memory
- */
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
-{
- char *buf = file->private_data;
- ssize_t acc = 0;
- size_t size, tsz;
- size_t elf_buflen;
- int nphdr;
- unsigned long start;
+ phdr = &phdrs[1];
+ list_for_each_entry(m, &kclist_head, list) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R | PF_W | PF_X;
+ phdr->p_offset = kc_vaddr_to_offset(m->addr)
+ + kcore_data_offset;
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
+ phdr->p_paddr = __pa(m->addr);
+ else if (m->type == KCORE_TEXT)
+ phdr->p_paddr = __pa_symbol(m->addr);
+ else
+ phdr->p_paddr = (elf_addr_t)-1;
+ phdr->p_filesz = phdr->p_memsz = m->size;
+ phdr->p_align = PAGE_SIZE;
+ phdr++;
+ }
- read_lock(&kclist_lock);
- size = get_kcore_size(&nphdr, &elf_buflen);
+ tsz = min_t(size_t, buflen,
+ phdrs_offset + kcore_phdrs_len - *fpos);
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
+ kfree(phdrs);
+ ret = -EFAULT;
+ goto out;
+ }
+ kfree(phdrs);
- if (buflen == 0 || *fpos >= size) {
- read_unlock(&kclist_lock);
- return 0;
+ buflen -= tsz;
+ *fpos += tsz;
}
- /* trim buflen to not go beyond EOF */
- if (buflen > size - *fpos)
- buflen = size - *fpos;
-
- /* construct an ELF core header if we'll need some of it */
- if (*fpos < elf_buflen) {
- char * elf_buf;
-
- tsz = elf_buflen - *fpos;
- if (buflen < tsz)
- tsz = buflen;
- elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
- if (!elf_buf) {
- read_unlock(&kclist_lock);
- return -ENOMEM;
+ /* ELF note segment. */
+ if (buflen && *fpos < notes_offset + kcore_notes_len) {
+ struct elf_prstatus prstatus = {};
+ struct elf_prpsinfo prpsinfo = {
+ .pr_sname = 'R',
+ .pr_fname = "vmlinux",
+ };
+ char *notes;
+ size_t i = 0;
+
+ strscpy(prpsinfo.pr_psargs, saved_command_line,
+ sizeof(prpsinfo.pr_psargs));
+
+ notes = kzalloc(kcore_notes_len, GFP_KERNEL);
+ if (!notes) {
+ ret = -ENOMEM;
+ goto out;
}
- elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
- read_unlock(&kclist_lock);
- if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
- kfree(elf_buf);
- return -EFAULT;
+
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
+ sizeof(prpsinfo));
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
+ arch_task_struct_size);
+ /*
+ * vmcoreinfo_size is mostly constant after init time, but it
+ * can be changed by crash_save_vmcoreinfo(). Racing here with a
+ * panic on another CPU before the machine goes down is insanely
+ * unlikely, but it's better to not leave potential buffer
+ * overflows lying around, regardless.
+ */
+ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, kcore_notes_len - i));
+
+ tsz = min_t(size_t, buflen,
+ notes_offset + kcore_notes_len - *fpos);
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
+ kfree(notes);
+ ret = -EFAULT;
+ goto out;
}
- kfree(elf_buf);
+ kfree(notes);
+
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
- acc += tsz;
-
- /* leave now if filled buffer already */
- if (buflen == 0)
- return acc;
- } else
- read_unlock(&kclist_lock);
+ }
/*
* Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list.
*/
- start = kc_offset_to_vaddr(*fpos - elf_buflen);
+ start = kc_offset_to_vaddr(*fpos - kcore_data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
-
+
+ m = NULL;
while (buflen) {
- struct kcore_list *m;
+ struct page *page;
+ unsigned long pfn;
+ phys_addr_t phys;
+ void *__start;
+
+ /*
+ * If this is the first iteration or the address is not within
+ * the previous entry, search for a matching entry.
+ */
+ if (!m || start < m->addr || start >= m->addr + m->size) {
+ struct kcore_list *pos;
+
+ m = NULL;
+ list_for_each_entry(pos, &kclist_head, list) {
+ if (start >= pos->addr &&
+ start < pos->addr + pos->size) {
+ m = pos;
+ break;
+ }
+ }
+ }
- read_lock(&kclist_lock);
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr && start < (m->addr+m->size))
- break;
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
}
- read_unlock(&kclist_lock);
-
- if (&m->list == &kclist_head) {
- if (clear_user(buffer, tsz))
- return -EFAULT;
- } else if (m->type == KCORE_VMALLOC) {
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz))
- return -EFAULT;
- } else {
- if (kern_addr_valid(start)) {
- unsigned long n;
-
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- memcpy(buf, (char *) start, tsz);
- n = copy_to_user(buffer, buf, tsz);
- /*
- * We cannot distinguish between fault on source
- * and fault on destination. When this happens
- * we clear too and hope it will trigger the
- * EFAULT again.
- */
- if (n) {
- if (clear_user(buffer + tsz - n,
- n))
- return -EFAULT;
+
+ if (!m) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ break;
+ }
+ case KCORE_USER:
+ /* User page is handled prior to normal kernel page: */
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ case KCORE_RAM:
+ phys = __pa(start);
+ pfn = phys >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
+ if (m->type == KCORE_RAM) {
+ __start = kc_xlate_dev_mem_ptr(phys);
+ if (!__start) {
+ ret = -ENOMEM;
+ if (iov_iter_zero(tsz, iter) != tsz)
+ ret = -EFAULT;
+ goto out;
}
} else {
- if (clear_user(buffer, tsz))
- return -EFAULT;
+ __start = (void *)start;
+ }
+
+ /*
+ * Sadly we must use a bounce buffer here to be able to
+ * make use of copy_from_kernel_nofault(), as these
+ * memory regions might not always be mapped on all
+ * architectures.
+ */
+ ret = copy_from_kernel_nofault(buf, __start, tsz);
+ if (m->type == KCORE_RAM)
+ kc_unxlate_dev_mem_ptr(phys, __start);
+ if (ret) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+ /*
+ * We know the bounce buffer is safe to copy from, so
+ * use _copy_to_iter() directly.
+ */
+ } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
}
+skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
- acc += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
- return acc;
+out:
+ page_offline_thaw();
+ percpu_up_read(&kclist_lock);
+ if (ret)
+ return ret;
+ return orig_buflen - buflen;
}
-
static int open_kcore(struct inode *inode, struct file *filp)
{
+ int ret = security_locked_down(LOCKDOWN_KCORE);
+
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
+ if (ret)
+ return ret;
+
filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!filp->private_data)
return -ENOMEM;
@@ -572,11 +662,12 @@ static int release_kcore(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations proc_kcore_operations = {
- .read = read_kcore,
- .open = open_kcore,
- .release = release_kcore,
- .llseek = default_llseek,
+static const struct proc_ops kcore_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_read_iter = read_kcore_iter,
+ .proc_open = open_kcore,
+ .proc_release = release_kcore,
+ .proc_lseek = default_llseek,
};
/* just remember that we have to update kcore */
@@ -586,17 +677,12 @@ static int __meminit kcore_callback(struct notifier_block *self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
- write_lock(&kclist_lock);
kcore_need_update = 1;
- write_unlock(&kclist_lock);
+ break;
}
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -620,7 +706,7 @@ static void __init proc_kcore_text_init(void)
/*
* MODULES_VADDR has no intersection with VMALLOC_ADDR.
*/
-struct kcore_list kcore_modules;
+static struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
@@ -636,8 +722,7 @@ static void __init add_modules_range(void)
static int __init proc_kcore_init(void)
{
- proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
- &proc_kcore_operations);
+ proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops);
if (!proc_root_kcore) {
pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
@@ -650,7 +735,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index f9387bb7631b..2fc92a13f9f8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/kmsg.c
*
@@ -14,11 +15,8 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <linux/uaccess.h>
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
@@ -39,26 +37,27 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
}
-static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+static __poll_t kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
-static const struct file_operations proc_kmsg_operations = {
- .read = kmsg_read,
- .poll = kmsg_poll,
- .open = kmsg_open,
- .release = kmsg_release,
- .llseek = generic_file_llseek,
+static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_read = kmsg_read,
+ .proc_poll = kmsg_poll,
+ .proc_open = kmsg_open,
+ .proc_release = kmsg_release,
+ .proc_lseek = generic_file_llseek,
};
static int __init proc_kmsg_init(void)
{
- proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+ proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops);
return 0;
}
fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 983fce5c2418..817981e57223 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/pid_namespace.h>
@@ -8,9 +9,7 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+#include "internal.h"
static int loadavg_proc_show(struct seq_file *m, void *v)
{
@@ -18,30 +17,21 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
get_avenrun(avnrun, FIXED_1/200, 0);
- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- task_active_pid_ns(current)->last_pid);
+ idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
return 0;
}
-static int loadavg_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, loadavg_proc_show, NULL);
-}
-
-static const struct file_operations loadavg_proc_fops = {
- .open = loadavg_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_loadavg_init(void)
{
- proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8a428498d6b2..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -5,8 +6,9 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
-#include <linux/quicklist.h>
+#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
@@ -15,8 +17,8 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include "internal.h"
void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
@@ -25,20 +27,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
{
- char v[32];
- static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
- int len;
-
- len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
-
- seq_write(m, s, 16);
-
- if (len > 0) {
- if (len < 8)
- seq_write(m, blanks, 8 - len);
-
- seq_write(m, v, len);
- }
+ seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
seq_write(m, " kB\n", 4);
}
@@ -49,11 +38,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
long cached;
long available;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long sreclaimable, sunreclaim;
int lru;
si_meminfo(&i);
si_swapinfo(&i);
- committed = percpu_counter_read_positive(&vm_committed_as);
+ committed = vm_memory_committed();
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -64,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
+ sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -80,7 +72,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
- show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+ show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
#ifdef CONFIG_HIGHMEM
show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -96,6 +88,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_long_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
@@ -105,34 +103,34 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Mapped: ",
global_node_page_state(NR_FILE_MAPPED));
show_val_kb(m, "Shmem: ", i.sharedram);
- show_val_kb(m, "Slab: ",
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE));
-
- show_val_kb(m, "SReclaimable: ",
- global_page_state(NR_SLAB_RECLAIMABLE));
- show_val_kb(m, "SUnreclaim: ",
- global_page_state(NR_SLAB_UNRECLAIMABLE));
+ show_val_kb(m, "KReclaimable: ", sreclaimable +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+ show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
+ show_val_kb(m, "SReclaimable: ", sreclaimable);
+ show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
- global_page_state(NR_KERNEL_STACK_KB));
- show_val_kb(m, "PageTables: ",
- global_page_state(NR_PAGETABLE));
-#ifdef CONFIG_QUICKLIST
- show_val_kb(m, "Quicklists: ", quicklist_total_size());
+ global_node_page_state(NR_KERNEL_STACK_KB));
+#ifdef CONFIG_SHADOW_CALL_STACK
+ seq_printf(m, "ShadowCallStack:%8lu kB\n",
+ global_node_page_state(NR_KERNEL_SCS_KB));
#endif
+ show_val_kb(m, "PageTables: ",
+ global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
- show_val_kb(m, "NFS_Unstable: ",
- global_node_page_state(NR_UNSTABLE_NFS));
- show_val_kb(m, "Bounce: ",
- global_page_state(NR_BOUNCE));
- show_val_kb(m, "WritebackTmp: ",
- global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "NFS_Unstable: ", 0);
+ show_val_kb(m, "Bounce: ", 0);
+ show_val_kb(m, "WritebackTmp: ", 0);
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
(unsigned long)VMALLOC_TOTAL >> 10);
- show_val_kb(m, "VmallocUsed: ", 0ul);
+ show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages());
show_val_kb(m, "VmallocChunk: ", 0ul);
+ show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+
+ memtest_report_meminfo(m);
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
@@ -141,19 +139,30 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
- global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_ANON_THPS));
show_val_kb(m, "ShmemHugePages: ",
- global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_THPS));
show_val_kb(m, "ShmemPmdMapped: ",
- global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_PMDMAPPED));
+ show_val_kb(m, "FileHugePages: ",
+ global_node_page_state(NR_FILE_THPS));
+ show_val_kb(m, "FilePmdMapped: ",
+ global_node_page_state(NR_FILE_PMDMAPPED));
#endif
#ifdef CONFIG_CMA
show_val_kb(m, "CmaTotal: ", totalcma_pages);
show_val_kb(m, "CmaFree: ",
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ show_val_kb(m, "Unaccepted: ",
+ global_zone_page_state(NR_UNACCEPTED));
+#endif
+ show_val_kb(m, "Balloon: ",
+ global_node_page_state(NR_BALLOON_PAGES));
+
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
@@ -161,21 +170,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int meminfo_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, meminfo_proc_show, NULL);
-}
-
-static const struct file_operations meminfo_proc_fops = {
- .open = meminfo_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_meminfo_init(void)
{
- proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 3803b24ca220..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
@@ -11,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -32,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_TIME_NS
+ &timens_operations,
+ &timens_for_children_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
@@ -41,22 +46,26 @@ static const char *proc_ns_get_link(struct dentry *dentry,
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
- void *error = ERR_PTR(-EACCES);
+ int error = -EACCES;
if (!dentry)
return ERR_PTR(-ECHILD);
task = get_proc_task(inode);
if (!task)
- return error;
+ return ERR_PTR(-EACCES);
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
- error = ns_get_path(&ns_path, task, ns_ops);
- if (!error)
- nd_jump_link(&ns_path);
- }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (error)
+ goto out;
+
+ error = nd_jump_link(&ns_path);
+out:
put_task_struct(task);
- return error;
+ return ERR_PTR(error);
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -74,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
- res = readlink_copy(buffer, buflen, name);
+ res = readlink_copy(buffer, buflen, name, strlen(name));
}
put_task_struct(task);
return res;
@@ -86,34 +95,29 @@ static const struct inode_operations proc_ns_link_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_ns_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_ns_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct proc_ns_operations *ns_ops = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
inode->i_op = &proc_ns_link_inode_operations;
ei->ns_ops = ns_ops;
+ pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -146,12 +150,10 @@ const struct file_operations proc_ns_dir_operations = {
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
- int error;
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -166,11 +168,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
if (entry == last)
goto out;
- error = proc_ns_instantiate(dir, dentry, task, *entry);
+ res = proc_ns_instantiate(dentry, task, *entry);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 75634379f82e..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* nommu.c: mmu-less memory info files
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
@@ -25,8 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
@@ -64,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
@@ -113,21 +107,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = {
.show = nommu_region_list_show
};
-static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &proc_nommu_region_list_seqop);
-}
-
-static const struct file_operations proc_nommu_region_list_operations = {
- .open = proc_nommu_region_list_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_nommu_init(void)
{
- proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
+ proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop);
return 0;
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2726536489b1..f9b2c2c906cd 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -1,4 +1,5 @@
-#include <linux/bootmem.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -9,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -18,39 +20,89 @@
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
-#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+enum kpage_operation {
+ KPAGE_FLAGS,
+ KPAGE_COUNT,
+ KPAGE_CGROUP,
+};
+
+static inline unsigned long get_max_dump_pfn(void)
+{
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * The memmap of early sections is completely populated and marked
+ * online even if max_pfn does not fall on a section boundary -
+ * pfn_to_online_page() will succeed on all pages. Allow inspecting
+ * these memmaps.
+ */
+ return round_up(max_pfn, PAGES_PER_SECTION);
+#else
+ return max_pfn;
+#endif
+}
+
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
+static ssize_t kpage_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ enum kpage_operation op)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
+ struct page *page;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
- u64 pcount;
+ u64 info;
pfn = src / KPMSIZE;
- count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
- if (!ppage || PageSlab(ppage))
- pcount = 0;
- else
- pcount = page_mapcount(ppage);
-
- if (put_user(pcount, out)) {
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ page = pfn_to_online_page(pfn);
+
+ if (page) {
+ switch (op) {
+ case KPAGE_FLAGS:
+ info = stable_page_flags(page);
+ break;
+ case KPAGE_COUNT:
+ info = get_kpage_count(page);
+ break;
+ case KPAGE_CGROUP:
+ info = page_cgroup_ino(page);
+ break;
+ default:
+ info = 0;
+ break;
+ }
+ } else
+ info = 0;
+
+ if (put_user(info, out)) {
ret = -EFAULT;
break;
}
@@ -68,26 +120,37 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
-static const struct file_operations proc_kpagecount_operations = {
- .llseek = mem_lseek,
- .read = kpagecount_read,
-};
-
-/* /proc/kpageflags - an array exposing page flags
+/* /proc/kpagecount - an array exposing page mapcounts
*
* Each entry is a u64 representing the corresponding
- * physical page flags.
+ * physical page mapcount.
*/
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return kpage_read(file, buf, count, ppos, KPAGE_COUNT);
+}
+
+static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecount_read,
+};
+
static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
{
return ((kflags >> kbit) & 1) << ubit;
}
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
{
- u64 k;
- u64 u;
+ const struct folio *folio;
+ struct page_snapshot ps;
+ unsigned long k;
+ unsigned long mapping;
+ bool is_anon;
+ u64 u = 0;
/*
* pseudo flag: KPF_NOPAGE
@@ -96,74 +159,63 @@ u64 stable_page_flags(struct page *page)
if (!page)
return 1 << KPF_NOPAGE;
- k = page->flags;
- u = 0;
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
+
+ k = folio->flags.f;
+ mapping = (unsigned long)folio->mapping;
+ is_anon = mapping & FOLIO_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
- *
- * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
- if (PageAnon(page))
+ if (is_anon) {
u |= 1 << KPF_ANON;
- if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ if (mapping & FOLIO_MAPPING_KSM)
+ u |= 1 << KPF_KSM;
+ }
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
- if (PageTail(page))
+ if (ps.idx == 0)
+ u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+ else
u |= 1 << KPF_COMPOUND_TAIL;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
- /*
- * PageTransCompound can be true for non-huge compound pages (slab
- * pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
- * to make sure a given page is a thp, not a non-huge compound page.
- */
- else if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
- else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
- }
- } else if (is_zero_pfn(page_to_pfn(page)))
+ else if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio)) {
+ /* Note: we indicate any THPs here, not just PMD-sized ones */
+ u |= 1 << KPF_THP;
+ } else if (is_huge_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ } else if (is_zero_pfn(ps.pfn)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ }
-
- /*
- * Caveats on high order pages: page->_refcount will only be set
- * -1 on the head page; SLUB/SLQB do the same for PG_slab;
- * SLOB won't set PG_slab at all on compound pages.
- */
- if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY)
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (folio_test_offline(folio))
+ u |= 1 << KPF_OFFLINE;
+ if (folio_test_pgtable(folio))
+ u |= 1 << KPF_PGTABLE;
+ if (folio_test_slab(folio))
+ u |= 1 << KPF_SLAB;
- if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+ u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
+#else
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
+#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
- u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
-
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -173,7 +225,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+ if ((k & SWAPCACHE) == SWAPCACHE)
u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
@@ -181,123 +234,65 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
#ifdef CONFIG_MEMORY_FAILURE
- u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
-#endif
-
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
+ if (u & (1 << KPF_HUGE))
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ else
+ u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
+#endif
return u;
-};
+}
+EXPORT_SYMBOL_GPL(stable_page_flags);
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
static ssize_t kpageflags_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
-
- pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
-
- while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
-
- if (put_user(stable_page_flags(ppage), out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_FLAGS);
}
-static const struct file_operations proc_kpageflags_operations = {
- .llseek = mem_lseek,
- .read = kpageflags_read,
+static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpageflags_read,
};
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
- u64 ino;
-
- pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
-
- while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
-
- if (ppage)
- ino = page_cgroup_ino(ppage);
- else
- ino = 0;
-
- if (put_user(ino, out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_CGROUP);
}
-
-static const struct file_operations proc_kpagecgroup_operations = {
- .llseek = mem_lseek,
- .read = kpagecgroup_read,
+static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecgroup_read,
};
#endif /* CONFIG_MEMCG */
static int __init proc_page_init(void)
{
- proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
- proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+ proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops);
+ proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops);
#ifdef CONFIG_MEMCG
- proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+ proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops);
#endif
return 0;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d72fc40241d9..52f0b75cbce2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/proc/net.c
*
@@ -7,9 +8,6 @@
*
* proc net directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -38,73 +36,234 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
-int seq_open_net(struct inode *ino, struct file *f,
- const struct seq_operations *ops, int size)
+static int seq_open_net(struct inode *inode, struct file *file)
{
- struct net *net;
+ unsigned int state_size = PDE(inode)->state_size;
struct seq_net_private *p;
+ struct net *net;
- BUG_ON(size < sizeof(*p));
+ WARN_ON_ONCE(state_size < sizeof(*p));
- net = get_proc_net(ino);
- if (net == NULL)
+ if (file->f_mode & FMODE_WRITE && !PDE(inode)->write)
+ return -EACCES;
+
+ net = get_proc_net(inode);
+ if (!net)
return -ENXIO;
- p = __seq_open_private(f, ops, size);
- if (p == NULL) {
+ p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
+ if (!p) {
put_net(net);
return -ENOMEM;
}
#ifdef CONFIG_NET_NS
p->net = net;
+ netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
#endif
return 0;
}
-EXPORT_SYMBOL_GPL(seq_open_net);
-int single_open_net(struct inode *inode, struct file *file,
- int (*show)(struct seq_file *, void *))
+static void seq_file_net_put_net(struct seq_file *seq)
{
- int err;
- struct net *net;
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *priv = seq->private;
- err = -ENXIO;
- net = get_proc_net(inode);
- if (net == NULL)
- goto err_net;
+ put_net_track(priv->net, &priv->ns_tracker);
+#else
+ put_net(&init_net);
+#endif
+}
- err = single_open(file, show, net);
- if (err < 0)
- goto err_open;
+static int seq_release_net(struct inode *ino, struct file *f)
+{
+ struct seq_file *seq = f->private_data;
+ seq_file_net_put_net(seq);
+ seq_release_private(ino, f);
return 0;
+}
-err_open:
- put_net(net);
-err_net:
- return err;
+static const struct proc_ops proc_net_seq_ops = {
+ .proc_open = seq_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_net,
+};
+
+int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
+
+ p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker,
+ GFP_KERNEL);
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(single_open_net);
-int seq_release_net(struct inode *ino, struct file *f)
+void bpf_iter_fini_seq_net(void *priv_data)
{
- struct seq_file *seq;
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
- seq = f->private_data;
+ put_net_track(p->net, &p->ns_tracker);
+#endif
+}
- put_net(seq_file_net(seq));
- seq_release_private(ino, f);
- return 0;
+struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_data);
+
+/**
+ * proc_create_net_data_write - Create a writable net_ns-specific proc file
+ * @name: The name of the file.
+ * @mode: The file's access mode.
+ * @parent: The parent directory in which to create.
+ * @ops: The seq_file ops with which to read the file.
+ * @write: The write method with which to 'modify' the file.
+ * @state_size: The size of the per-file private state to allocate.
+ * @data: Data for retrieval by pde_data().
+ *
+ * Create a network namespaced proc file in the @parent directory with the
+ * specified @name and @mode that allows reading of a file that displays a
+ * series of elements and also provides for the file accepting writes that have
+ * some arbitrary effect.
+ *
+ * The functions in the @ops table are used to iterate over items to be
+ * presented and extract the readable content using the seq_file interface.
+ *
+ * The @write function is called with the data copied into a kernel space
+ * scratch buffer and has a NUL appended for convenience. The buffer may be
+ * modified by the @write function. @write should return 0 on success.
+ *
+ * The @data value is accessible from the @show and @write functions by calling
+ * pde_data() on the file inode. The network namespace must be accessed by
+ * calling seq_file_net() on the seq_file struct.
+ */
+struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct seq_operations *ops,
+ proc_write_t write,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ p->write = write;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_data_write);
+
+static int single_open_net(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+ struct net *net;
+ int err;
+
+ net = get_proc_net(inode);
+ if (!net)
+ return -ENXIO;
+
+ err = single_open(file, de->single_show, net);
+ if (err)
+ put_net(net);
+ return err;
}
-EXPORT_SYMBOL_GPL(seq_release_net);
-int single_release_net(struct inode *ino, struct file *f)
+static int single_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq = f->private_data;
put_net(seq->private);
return single_release(ino, f);
}
-EXPORT_SYMBOL_GPL(single_release_net);
+
+static const struct proc_ops proc_net_single_ops = {
+ .proc_open = single_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_single_ops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single);
+
+/**
+ * proc_create_net_single_write - Create a writable net_ns-specific proc file
+ * @name: The name of the file.
+ * @mode: The file's access mode.
+ * @parent: The parent directory in which to create.
+ * @show: The seqfile show method with which to read the file.
+ * @write: The write method with which to 'modify' the file.
+ * @data: Data for retrieval by pde_data().
+ *
+ * Create a network-namespaced proc file in the @parent directory with the
+ * specified @name and @mode that allows reading of a file that displays a
+ * single element rather than a series and also provides for the file accepting
+ * writes that have some arbitrary effect.
+ *
+ * The @show function is called to extract the readable content via the
+ * seq_file interface.
+ *
+ * The @write function is called with the data copied into a kernel space
+ * scratch buffer and has a NUL appended for convenience. The buffer may be
+ * modified by the @write function. @write should return 0 on success.
+ *
+ * The @data value is accessible from the @show and @write functions by calling
+ * pde_data() on the file inode. The network namespace must be accessed by
+ * calling seq_file_single_net() on the seq_file struct.
+ */
+struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *),
+ proc_write_t write,
+ void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_single_ops;
+ p->single_show = show;
+ p->write = write;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single_write);
static struct net *get_proc_task_net(struct inode *dir)
{
@@ -135,13 +294,14 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
de = ERR_PTR(-ENOENT);
net = get_proc_task_net(dir);
if (net != NULL) {
- de = proc_lookup_de(net->proc_net, dir, dentry);
+ de = proc_lookup_de(dir, dentry, net->proc_net);
put_net(net);
}
return de;
}
-static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -149,7 +309,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
net = get_proc_task_net(inode);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (net != NULL) {
stat->nlink = net->proc_net->nlink;
@@ -162,6 +322,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
const struct inode_operations proc_net_inode_operations = {
.lookup = proc_tgid_net_lookup,
.getattr = proc_tgid_net_getattr,
+ .setattr = proc_setattr,
};
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
@@ -172,7 +333,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
ret = -EINVAL;
net = get_proc_task_net(file_inode(file));
if (net != NULL) {
- ret = proc_readdir_de(net->proc_net, file, ctx);
+ ret = proc_readdir_de(file, ctx, net->proc_net);
put_net(net);
}
return ret;
@@ -191,8 +352,14 @@ static __net_init int proc_net_ns_init(struct net *net)
kgid_t gid;
int err;
+ /*
+ * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+ * Corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
err = -ENOMEM;
- netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
+ netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
goto out;
@@ -201,6 +368,7 @@ static __net_init int proc_net_ns_init(struct net *net)
netd->nlink = 2;
netd->namelen = 3;
netd->parent = &proc_root;
+ netd->name = netd->inline_name;
memcpy(netd->name, "net", 4);
uid = make_kuid(net->user_ns, 0);
@@ -213,6 +381,9 @@ static __net_init int proc_net_ns_init(struct net *net)
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
@@ -223,7 +394,7 @@ static __net_init int proc_net_ns_init(struct net *net)
return 0;
free_net:
- kfree(netd);
+ pde_free(netd);
out:
return err;
}
@@ -231,7 +402,7 @@ out:
static __net_exit void proc_net_ns_exit(struct net *net)
{
remove_proc_entry("stat", net->proc_net);
- kfree(net->proc_net);
+ pde_free(net->proc_net);
}
static struct pernet_operations __net_initdata proc_net_ns_ops = {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8f479229b349..49ab74e0bfde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* /proc/sys support
*/
@@ -11,36 +12,52 @@
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/mm.h>
+#include <linux/uio.h>
#include <linux/module.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/mount.h>
+#include <linux/kmemleak.h>
+#include <linux/lockdep.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, header) \
+ entry = header->ctl_table; \
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* Support for permanently empty directories */
-
-struct ctl_table sysctl_mount_point[] = {
+/*
+ * Support for permanently empty directories.
+ * Must be non-empty to avoid sharing an address with other tables.
+ */
+static const struct ctl_table sysctl_mount_point[] = {
{ }
};
-static bool is_empty_dir(struct ctl_table_header *head)
-{
- return head->ctl_table[0].child == sysctl_mount_point;
-}
-
-static void set_empty_dir(struct ctl_dir *dir)
+/**
+ * register_sysctl_mount_point() - registers a sysctl mount point
+ * @path: path for the mount point
+ *
+ * Used to create a permanently empty directory to serve as mount point.
+ * There are some subtle but important permission checks this allows in the
+ * case of unprivileged mounts.
+ */
+struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- dir->header.ctl_table[0].child = sysctl_mount_point;
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
+EXPORT_SYMBOL(register_sysctl_mount_point);
-static void clear_empty_dir(struct ctl_dir *dir)
-
-{
- dir->header.ctl_table[0].child = NULL;
-}
+#define sysctl_is_perm_empty_ctl_header(hptr) \
+ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+#define sysctl_set_perm_empty_ctl_header(hptr) \
+ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+#define sysctl_clear_perm_empty_ctl_header(hptr) \
+ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT)
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
@@ -51,12 +68,11 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
-static struct ctl_table root_table[] = {
+static const struct ctl_table root_table[] = {
{
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
- { }
};
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
@@ -73,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry);
+ const struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
@@ -86,27 +102,23 @@ static void sysctl_print_dir(struct ctl_dir *dir)
static int namecmp(const char *name1, int len1, const char *name2, int len2)
{
- int minlen;
int cmp;
- minlen = len1;
- if (minlen > len2)
- minlen = len2;
-
- cmp = memcmp(name1, name2, minlen);
+ cmp = memcmp(name1, name2, min(len1, len2));
if (cmp == 0)
cmp = len1 - len2;
return cmp;
}
-/* Called under sysctl_lock */
-static struct ctl_table *find_entry(struct ctl_table_header **phead,
+static const struct ctl_table *find_entry(struct ctl_table_header **phead,
struct ctl_dir *dir, const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct rb_node *node = dir->root.rb_node;
+ lockdep_assert_held(&sysctl_lock);
+
while (node)
{
struct ctl_node *ctl_node;
@@ -131,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead,
return NULL;
}
-static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
struct rb_node **p = &head->parent->root.rb_node;
@@ -141,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
while (*p) {
struct ctl_table_header *parent_head;
- struct ctl_table *parent_entry;
+ const struct ctl_table *parent_entry;
struct ctl_node *parent_node;
const char *parent_name;
int cmp;
@@ -160,7 +172,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
else {
pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
- pr_cont("/%s\n", entry->procname);
+ pr_cont("%s\n", entry->procname);
return -EEXIST;
}
}
@@ -170,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
return 0;
}
-static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
@@ -179,9 +191,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
static void init_header(struct ctl_table_header *head,
struct ctl_table_root *root, struct ctl_table_set *set,
- struct ctl_node *node, struct ctl_table *table)
+ struct ctl_node *node, const struct ctl_table *table, size_t table_size)
{
head->ctl_table = table;
+ head->ctl_table_size = table_size;
head->ctl_table_arg = table;
head->used = 0;
head->count = 1;
@@ -193,41 +206,49 @@ static void init_header(struct ctl_table_header *head,
head->node = node;
INIT_HLIST_HEAD(&head->inodes);
if (node) {
- struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+ const struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, head) {
node->header = head;
+ node++;
+ }
}
+ if (table == sysctl_mount_point)
+ sysctl_set_perm_empty_ctl_header(head);
}
static void erase_header(struct ctl_table_header *head)
{
- struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+ const struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, head)
erase_entry(head, entry);
}
static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
+ struct ctl_table_header *dir_h = &dir->header;
int err;
+
/* Is this a permanently empty directory? */
- if (is_empty_dir(&dir->header))
+ if (sysctl_is_perm_empty_ctl_header(dir_h))
return -EROFS;
/* Am I creating a permanently empty directory? */
- if (header->ctl_table == sysctl_mount_point) {
+ if (sysctl_is_perm_empty_ctl_header(header)) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
- set_empty_dir(dir);
+ sysctl_set_perm_empty_ctl_header(dir_h);
}
- dir->header.nreg++;
+ dir_h->nreg++;
header->parent = dir;
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -238,70 +259,41 @@ fail:
put_links(header);
fail_links:
if (header->ctl_table == sysctl_mount_point)
- clear_empty_dir(dir);
+ sysctl_clear_perm_empty_ctl_header(dir_h);
header->parent = NULL;
- drop_sysctl_table(&dir->header);
+ drop_sysctl_table(dir_h);
return err;
}
-/* called under sysctl_lock */
static int use_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (unlikely(p->unregistering))
return 0;
p->used++;
return 1;
}
-/* called under sysctl_lock */
static void unuse_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (!--p->used)
if (unlikely(p->unregistering))
complete(p->unregistering);
}
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
- struct inode *inode;
- struct proc_inode *ei;
- struct hlist_node *node;
- struct super_block *sb;
-
- rcu_read_lock();
- for (;;) {
- node = hlist_first_rcu(&head->inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
- spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&ei->sysctl_inodes);
- spin_unlock(&sysctl_lock);
-
- inode = &ei->vfs_inode;
- sb = inode->i_sb;
- if (!atomic_inc_not_zero(&sb->s_active))
- continue;
- inode = igrab(inode);
- rcu_read_unlock();
- if (unlikely(!inode)) {
- deactivate_super(sb);
- rcu_read_lock();
- continue;
- }
-
- d_prune_aliases(inode);
- iput(inode);
- deactivate_super(sb);
-
- rcu_read_lock();
- }
- rcu_read_unlock();
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
-/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
+ /* will reacquire if has to wait */
+ lockdep_assert_held(&sysctl_lock);
+
/*
* if p->used is 0, nobody will ever touch that entry again;
* we'll eliminate all paths to it before dropping sysctl_lock
@@ -318,10 +310,10 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
}
/*
- * Prune dentries for unregistered sysctls: namespaced sysctls
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
- proc_sys_prune_dcache(p);
+ proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
@@ -358,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root)
return set;
}
-static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
- struct ctl_dir *dir,
- const char *name, int namelen)
+static const struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
spin_lock(&sysctl_lock);
entry = find_entry(&head, dir, name, namelen);
@@ -388,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node)
}
static void first_entry(struct ctl_dir *dir,
- struct ctl_table_header **phead, struct ctl_table **pentry)
+ struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = NULL;
- struct ctl_table *entry = NULL;
+ const struct ctl_table *entry = NULL;
struct ctl_node *ctl_node;
spin_lock(&sysctl_lock);
@@ -405,10 +397,10 @@ static void first_entry(struct ctl_dir *dir,
*pentry = entry;
}
-static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = *phead;
- struct ctl_table *entry = *pentry;
+ const struct ctl_table *entry = *pentry;
struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
spin_lock(&sysctl_lock);
@@ -441,7 +433,7 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op)
{
struct ctl_table_root *root = head->root;
int mode;
@@ -455,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
}
static struct inode *proc_sys_make_inode(struct super_block *sb,
- struct ctl_table_header *head, struct ctl_table *table)
+ struct ctl_table_header *head, const struct ctl_table *table)
{
struct ctl_table_root *root = head->root;
struct inode *inode;
@@ -463,7 +455,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode = new_inode(sb);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOMEM);
inode->i_ino = get_next_ino();
@@ -473,16 +465,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (unlikely(head->unregistering)) {
spin_unlock(&sysctl_lock);
iput(inode);
- inode = NULL;
- goto out;
+ return ERR_PTR(-ENOENT);
}
ei->sysctl = head;
ei->sysctl_entry = table;
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -492,21 +483,22 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mode |= S_IFDIR;
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
- if (is_empty_dir(head))
+ if (sysctl_is_perm_empty_ctl_header(head))
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
- root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+ root->set_ownership(head, &inode->i_uid, &inode->i_gid);
-out:
return inode;
}
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
@@ -526,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
const struct qstr *name = &dentry->d_name;
- struct ctl_table *p;
+ const struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
struct ctl_dir *ctl_dir;
@@ -548,14 +540,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (!inode)
- goto out;
-
- err = NULL;
- d_set_d_op(dentry, &proc_sys_dentry_operations);
- d_add(dentry, inode);
+ err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations);
out:
if (h)
@@ -564,14 +550,15 @@ out:
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
- size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+ int write)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ size_t count = iov_iter_count(iter);
+ char *kbuf;
ssize_t error;
- size_t res;
if (IS_ERR(head))
return PTR_ERR(head);
@@ -589,33 +576,60 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
if (!table->proc_handler)
goto out;
+ /* don't even try if the size is too large */
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
+
+ if (write) {
+ error = -EFAULT;
+ if (!copy_from_iter_full(kbuf, count, iter))
+ goto out_free_buf;
+ kbuf[count] = '\0';
+ }
+
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
+ &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
/* careful: calling conventions are nasty here */
- res = count;
- error = table->proc_handler(table, write, buf, &res, ppos);
- if (!error)
- error = res;
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
+ if (!write) {
+ error = -EFAULT;
+ if (copy_to_iter(kbuf, count, iter) < count)
+ goto out_free_buf;
+ }
+
+ error = count;
+out_free_buf:
+ kvfree(kbuf);
out:
sysctl_head_finish(head);
return error;
}
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+ return proc_sys_call_handler(iocb, iter, 0);
}
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+ return proc_sys_call_handler(iocb, iter, 1);
}
static int proc_sys_open(struct inode *inode, struct file *filp)
{
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
/* sysctl was unregistered */
if (IS_ERR(head))
@@ -629,17 +643,17 @@ static int proc_sys_open(struct inode *inode, struct file *filp)
return 0;
}
-static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- unsigned int ret = DEFAULT_POLLMASK;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ __poll_t ret = DEFAULT_POLLMASK;
unsigned long event;
/* sysctl was unregistered */
if (IS_ERR(head))
- return POLLERR | POLLHUP;
+ return EPOLLERR | EPOLLHUP;
if (!table->proc_handler)
goto out;
@@ -652,7 +666,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
if (event != atomic_read(&table->poll->event)) {
filp->private_data = proc_sys_poll_event(table->poll);
- ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
}
out:
@@ -664,7 +678,7 @@ out:
static bool proc_sys_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
@@ -683,14 +697,19 @@ static bool proc_sys_fill_cache(struct file *file,
if (IS_ERR(child))
return false;
if (d_in_lookup(child)) {
+ struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (!inode) {
- d_lookup_done(child);
+ res = d_splice_alias_ops(inode, child,
+ &proc_sys_dentry_operations);
+ d_lookup_done(child);
+ if (unlikely(res)) {
dput(child);
- return false;
+
+ if (IS_ERR(res))
+ return false;
+
+ child = res;
}
- d_set_d_op(child, &proc_sys_dentry_operations);
- d_add(child, inode);
}
}
inode = d_inode(child);
@@ -703,17 +722,17 @@ static bool proc_sys_fill_cache(struct file *file,
static bool proc_sys_link_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
bool ret = true;
+
head = sysctl_head_grab(head);
+ if (IS_ERR(head))
+ return false;
- if (S_ISLNK(table->mode)) {
- /* It is not an error if we can not follow the link ignore it */
- int err = sysctl_follow_link(&head, &table);
- if (err)
- goto out;
- }
+ /* It is not an error if we can not follow the link ignore it */
+ if (sysctl_follow_link(&head, &table))
+ goto out;
ret = proc_sys_fill_cache(file, ctx, head, table);
out:
@@ -721,7 +740,7 @@ out:
return ret;
}
-static int scan(struct ctl_table_header *head, struct ctl_table *table,
+static int scan(struct ctl_table_header *head, const struct ctl_table *table,
unsigned long *pos, struct file *file,
struct dir_context *ctx)
{
@@ -745,7 +764,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
@@ -770,14 +789,15 @@ out:
return 0;
}
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
/*
* sysctl entries that are not writeable,
* are _NOT_ writeable, capabilities or not.
*/
struct ctl_table_header *head;
- struct ctl_table *table;
+ const struct ctl_table *table;
int error;
/* Executable files are not allowed under /proc/sys/ */
@@ -798,7 +818,8 @@ static int proc_sys_permission(struct inode *inode, int mask)
return error;
}
-static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+static int proc_sys_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error;
@@ -806,26 +827,26 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
return 0;
}
-static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+static int proc_sys_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
@@ -836,8 +857,10 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
static const struct file_operations proc_sys_file_operations = {
.open = proc_sys_open,
.poll = proc_sys_poll,
- .read = proc_sys_read,
- .write = proc_sys_write,
+ .read_iter = proc_sys_read,
+ .write_iter = proc_sys_write,
+ .splice_read = copy_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
@@ -860,7 +883,8 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -893,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
@@ -917,7 +945,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir,
const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
entry = find_entry(&head, dir, name, namelen);
if (!entry)
@@ -936,19 +964,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
- init_header(&new->header, set->dir.header.root, set, node, table);
+ init_header(&new->header, set->dir.header.root, set, node, table, 1);
return new;
}
@@ -1005,8 +1032,8 @@ failed:
if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
- pr_cont("/%*.*s %ld\n",
- namelen, namelen, name, PTR_ERR(subdir));
+ pr_cont("%*.*s %ld\n", namelen, namelen, name,
+ PTR_ERR(subdir));
}
drop_sysctl_table(&dir->header);
if (new)
@@ -1029,16 +1056,15 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry)
+ const struct ctl_table **pentry)
{
struct ctl_table_header *head;
+ const struct ctl_table *entry;
struct ctl_table_root *root;
struct ctl_table_set *set;
- struct ctl_table *entry;
struct ctl_dir *dir;
int ret;
- ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
set = lookup_header_set(root);
@@ -1062,7 +1088,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
return ret;
}
-static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1078,72 +1104,96 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
return -EINVAL;
}
-static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+static int sysctl_check_table_array(const char *path, const struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
(table->proc_handler == proc_douintvec_minmax)) {
if (table->maxlen != sizeof(unsigned int))
- err |= sysctl_err(path, table, "array now allowed");
+ err |= sysctl_err(path, table, "array not allowed");
+ }
+
+ if (table->proc_handler == proc_dou8vec_minmax) {
+ if (table->maxlen != sizeof(u8))
+ err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ }
+
+ if (table->proc_handler == proc_dobool) {
+ if (table->maxlen != sizeof(bool))
+ err |= sysctl_err(path, table, "array not allowed");
}
return err;
}
-static int sysctl_check_table(const char *path, struct ctl_table *table)
+static int sysctl_check_table(const char *path, struct ctl_table_header *header)
{
+ const struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dobool) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
-static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
- struct ctl_table_root *link_root)
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
{
- struct ctl_table *link_table, *entry, *link;
+ struct ctl_table *link_table, *link;
struct ctl_table_header *links;
+ const struct ctl_table *entry;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
- nr_entries++;
+ list_for_each_table_entry(entry, head) {
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1151,33 +1201,41 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, head) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
- link->data = link_root;
+ link->data = head->root;
link_name += len;
+ link++;
}
- init_header(links, dir->header.root, dir->header.set, node, link_table);
- links->nreg = nr_entries;
+ init_header(links, dir->header.root, dir->header.set, node, link_table,
+ head->ctl_table_size);
+ links->nreg = head->ctl_table_size;
return links;
}
static bool get_links(struct ctl_dir *dir,
- struct ctl_table *table, struct ctl_table_root *link_root)
+ struct ctl_table_header *header,
+ struct ctl_table_root *link_root)
{
- struct ctl_table_header *head;
- struct ctl_table *entry, *link;
+ struct ctl_table_header *tmp_head;
+ const struct ctl_table *entry, *link;
+
+ if (header->ctl_table_size == 0 ||
+ sysctl_is_perm_empty_ctl_header(header))
+ return true;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
if (!link)
return false;
if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
@@ -1188,10 +1246,10 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
- head->nreg++;
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
+ tmp_head->nreg++;
}
return true;
}
@@ -1199,7 +1257,7 @@ static bool get_links(struct ctl_dir *dir,
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
- struct ctl_dir *core_parent = NULL;
+ struct ctl_dir *core_parent;
struct ctl_table_header *links;
int err;
@@ -1210,13 +1268,13 @@ static int insert_links(struct ctl_table_header *head)
if (IS_ERR(core_parent))
return 0;
- if (get_links(core_parent, head->ctl_table, head->root))
+ if (get_links(core_parent, head, head->root))
return 0;
core_parent->header.nreg++;
spin_unlock(&sysctl_lock);
- links = new_links(core_parent, head->ctl_table, head->root);
+ links = new_links(core_parent, head);
spin_lock(&sysctl_lock);
err = -ENOMEM;
@@ -1224,7 +1282,7 @@ static int insert_links(struct ctl_table_header *head)
goto out;
err = 0;
- if (get_links(core_parent, head->ctl_table, head->root)) {
+ if (get_links(core_parent, head, head->root)) {
kfree(links);
goto out;
}
@@ -1237,34 +1295,64 @@ out:
return err;
}
+/* Find the directory for the ctl_table. If one is not found create it. */
+static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
+{
+ const char *name, *nextname;
+
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ /*
+ * namelen ensures if name is "foo/bar/yay" only foo is
+ * registered first. We traverse as if using mkdir -p and
+ * return a ctl_dir for the last directory entry.
+ */
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ break;
+ }
+ return dir;
+}
+
/**
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ *
+ * @table: the top-level table structure. This table should not be free'd
+ * after registration. So it should not be used on stack. It can either
+ * be a global or dynamically allocated by the caller and free'd later
+ * after sysctl unregistration.
+ * @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
@@ -1281,53 +1369,32 @@ out:
*/
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
- const char *path, struct ctl_table *table)
+ const char *path, const struct ctl_table *table, size_t table_size)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
- const char *name, *nextname;
struct ctl_dir *dir;
- struct ctl_table *entry;
struct ctl_node *node;
- int nr_entries = 0;
-
- for (entry = table; entry->procname; entry++)
- nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
node = (struct ctl_node *)(header + 1);
- init_header(header, root, set, node, table);
- if (sysctl_check_table(path, table))
+ init_header(header, root, set, node, table, table_size);
+ if (sysctl_check_table(path, header))
goto fail;
spin_lock(&sysctl_lock);
dir = &set->dir;
- /* Reference moved down the diretory tree get_subdir */
+ /* Reference moved down the directory tree get_subdir */
dir->header.nreg++;
spin_unlock(&sysctl_lock);
- /* Find the directory for the ctl_table */
- for (name = path; name; name = nextname) {
- int namelen;
- nextname = strchr(name, '/');
- if (nextname) {
- namelen = nextname - name;
- nextname++;
- } else {
- namelen = strlen(name);
- }
- if (namelen == 0)
- continue;
-
- dir = get_subdir(dir, name, namelen);
- if (IS_ERR(dir))
- goto fail;
- }
-
+ dir = sysctl_mkdir_p(dir, path);
+ if (IS_ERR(dir))
+ goto fail;
spin_lock(&sysctl_lock);
if (insert_header(dir, header))
goto fail_put_dir_locked;
@@ -1342,239 +1409,67 @@ fail_put_dir_locked:
spin_unlock(&sysctl_lock);
fail:
kfree(header);
- dump_stack();
return NULL;
}
/**
- * register_sysctl - register a sysctl table
- * @path: The path to the directory the sysctl table is in.
- * @table: the table structure
+ * register_sysctl_sz - register a sysctl table
+ * @path: The path to the directory the sysctl table is in. If the path
+ * doesn't exist we will create it for you.
+ * @table: the table structure. The calller must ensure the life of the @table
+ * will be kept during the lifetime use of the syctl. It must not be freed
+ * until unregister_sysctl_table() is called with the given returned table
+ * with this registration. If your code is non modular then you don't need
+ * to call unregister_sysctl_table() and can instead use something like
+ * register_sysctl_init() which does not care for the result of the syctl
+ * registration.
+ * @table_size: The number of elements in table.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_table for more details.
*/
-struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
+ size_t table_size)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
- path, table);
-}
-EXPORT_SYMBOL(register_sysctl);
-
-static char *append_path(const char *path, char *pos, const char *name)
-{
- int namelen;
- namelen = strlen(name);
- if (((pos - path) + namelen + 2) >= PATH_MAX)
- return NULL;
- memcpy(pos, name, namelen);
- pos[namelen] = '/';
- pos[namelen + 1] = '\0';
- pos += namelen + 1;
- return pos;
-}
-
-static int count_subheaders(struct ctl_table *table)
-{
- int has_files = 0;
- int nr_subheaders = 0;
- struct ctl_table *entry;
-
- /* special case: no directory and empty directory */
- if (!table || !table->procname)
- return 1;
-
- for (entry = table; entry->procname; entry++) {
- if (entry->child)
- nr_subheaders += count_subheaders(entry->child);
- else
- has_files = 1;
- }
- return nr_subheaders + has_files;
-}
-
-static int register_leaf_sysctl_tables(const char *path, char *pos,
- struct ctl_table_header ***subheader, struct ctl_table_set *set,
- struct ctl_table *table)
-{
- struct ctl_table *ctl_table_arg = NULL;
- struct ctl_table *entry, *files;
- int nr_files = 0;
- int nr_dirs = 0;
- int err = -ENOMEM;
-
- for (entry = table; entry->procname; entry++) {
- if (entry->child)
- nr_dirs++;
- else
- nr_files++;
- }
-
- files = table;
- /* If there are mixed files and directories we need a new table */
- if (nr_dirs && nr_files) {
- struct ctl_table *new;
- files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
- GFP_KERNEL);
- if (!files)
- goto out;
-
- ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
- if (entry->child)
- continue;
- *new = *entry;
- new++;
- }
- }
-
- /* Register everything except a directory full of subdirectories */
- if (nr_files || !nr_dirs) {
- struct ctl_table_header *header;
- header = __register_sysctl_table(set, path, files);
- if (!header) {
- kfree(ctl_table_arg);
- goto out;
- }
-
- /* Remember if we need to free the file table */
- header->ctl_table_arg = ctl_table_arg;
- **subheader = header;
- (*subheader)++;
- }
-
- /* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
- char *child_pos;
-
- if (!entry->child)
- continue;
-
- err = -ENAMETOOLONG;
- child_pos = append_path(path, pos, entry->procname);
- if (!child_pos)
- goto out;
-
- err = register_leaf_sysctl_tables(path, child_pos, subheader,
- set, entry->child);
- pos[0] = '\0';
- if (err)
- goto out;
- }
- err = 0;
-out:
- /* On failure our caller will unregister all registered subheaders */
- return err;
+ path, table, table_size);
}
+EXPORT_SYMBOL(register_sysctl_sz);
/**
- * __register_sysctl_paths - register a sysctl table hierarchy
- * @set: Sysctl tree to register on
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ * The caller must ensure the life of the @table will be kept during the
+ * lifetime use of the sysctl.
+ * @table_name: The name of sysctl table, only used for log printing when
+ * registration fails
+ * @table_size: The number of elements in table
*
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
*
- * See __register_sysctl_table for more details.
+ * Context: if your base directory does not exist it will be created for you.
*/
-struct ctl_table_header *__register_sysctl_paths(
- struct ctl_table_set *set,
- const struct ctl_path *path, struct ctl_table *table)
+void __init __register_sysctl_init(const char *path, const struct ctl_table *table,
+ const char *table_name, size_t table_size)
{
- struct ctl_table *ctl_table_arg = table;
- int nr_subheaders = count_subheaders(table);
- struct ctl_table_header *header = NULL, **subheaders, **subheader;
- const struct ctl_path *component;
- char *new_path, *pos;
-
- pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!new_path)
- return NULL;
+ struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
- pos[0] = '\0';
- for (component = path; component->procname; component++) {
- pos = append_path(new_path, pos, component->procname);
- if (!pos)
- goto out;
- }
- while (table->procname && table->child && !table[1].procname) {
- pos = append_path(new_path, pos, table->procname);
- if (!pos)
- goto out;
- table = table->child;
- }
- if (nr_subheaders == 1) {
- header = __register_sysctl_table(set, new_path, table);
- if (header)
- header->ctl_table_arg = ctl_table_arg;
- } else {
- header = kzalloc(sizeof(*header) +
- sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
- if (!header)
- goto out;
-
- subheaders = (struct ctl_table_header **) (header + 1);
- subheader = subheaders;
- header->ctl_table_arg = ctl_table_arg;
-
- if (register_leaf_sysctl_tables(new_path, pos, &subheader,
- set, table))
- goto err_register_leaves;
- }
-
-out:
- kfree(new_path);
- return header;
-
-err_register_leaves:
- while (subheader > subheaders) {
- struct ctl_table_header *subh = *(--subheader);
- struct ctl_table *table = subh->ctl_table_arg;
- unregister_sysctl_table(subh);
- kfree(table);
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path);
+ return;
}
- kfree(header);
- header = NULL;
- goto out;
-}
-
-/**
- * register_sysctl_table_path - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return __register_sysctl_paths(&sysctl_table_root.default_set,
- path, table);
-}
-EXPORT_SYMBOL(register_sysctl_paths);
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
- static const struct ctl_path null_path[] = { {} };
-
- return register_sysctl_paths(null_path, table);
+ kmemleak_not_leak(hdr);
}
-EXPORT_SYMBOL(register_sysctl_table);
static void put_links(struct ctl_table_header *header)
{
@@ -1582,7 +1477,7 @@ static void put_links(struct ctl_table_header *header)
struct ctl_table_root *root = header->root;
struct ctl_dir *parent = header->parent;
struct ctl_dir *core_parent;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
if (header->set == root_set)
return;
@@ -1591,9 +1486,9 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
struct ctl_table_header *link_head;
- struct ctl_table *link;
+ const struct ctl_table *link;
const char *name = entry->procname;
link = find_entry(&link_head, core_parent, name, strlen(name));
@@ -1605,7 +1500,7 @@ static void put_links(struct ctl_table_header *header)
else {
pr_err("sysctl link missing during unregister: ");
sysctl_print_dir(parent);
- pr_cont("/%s\n", name);
+ pr_cont("%s\n", name);
}
}
}
@@ -1617,8 +1512,11 @@ static void drop_sysctl_table(struct ctl_table_header *header)
if (--header->nreg)
return;
- put_links(header);
- start_unregistering(header);
+ if (parent) {
+ put_links(header);
+ start_unregistering(header);
+ }
+
if (!--header->count)
kfree_rcu(header, rcu);
@@ -1628,35 +1526,18 @@ static void drop_sysctl_table(struct ctl_table_header *header)
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
- * @header: the header returned from register_sysctl_table
+ * @header: the header returned from register_sysctl or __register_sysctl_table
*
* Unregisters the sysctl table and all children. proc entries may not
* actually be removed until they are no longer used by anyone.
*/
void unregister_sysctl_table(struct ctl_table_header * header)
{
- int nr_subheaders;
might_sleep();
if (header == NULL)
return;
- nr_subheaders = count_subheaders(header->ctl_table_arg);
- if (unlikely(nr_subheaders > 1)) {
- struct ctl_table_header **subheaders;
- int i;
-
- subheaders = (struct ctl_table_header **)(header + 1);
- for (i = nr_subheaders -1; i >= 0; i--) {
- struct ctl_table_header *subh = subheaders[i];
- struct ctl_table *table = subh->ctl_table_arg;
- unregister_sysctl_table(subh);
- kfree(table);
- }
- kfree(header);
- return;
- }
-
spin_lock(&sysctl_lock);
drop_sysctl_table(header);
spin_unlock(&sysctl_lock);
@@ -1669,7 +1550,7 @@ void setup_sysctl_set(struct ctl_table_set *set,
{
memset(set, 0, sizeof(*set));
set->is_seen = is_seen;
- init_header(&set->dir.header, root, set, NULL, root_table);
+ init_header(&set->dir.header, root, set, NULL, root_table, 1);
}
void retire_sysctl_set(struct ctl_table_set *set)
@@ -1683,8 +1564,163 @@ int __init proc_sys_init(void)
proc_sys_root = proc_mkdir("sys", NULL);
proc_sys_root->proc_iops = &proc_sys_dir_operations;
- proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return sysctl_init();
+ return sysctl_init_bases();
+}
+
+struct sysctl_alias {
+ const char *kernel_param;
+ const char *sysctl_param;
+};
+
+/*
+ * Historically some settings had both sysctl and a command line parameter.
+ * With the generic sysctl. parameter support, we can handle them at a single
+ * place and only keep the historical name for compatibility. This is not meant
+ * to add brand new aliases. When adding existing aliases, consider whether
+ * the possibly different moment of changing the value (e.g. from early_param
+ * to the moment do_sysctl_args() is called) is an issue for the specific
+ * parameter.
+ */
+static const struct sysctl_alias sysctl_aliases[] = {
+ {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" },
+ {"hung_task_panic", "kernel.hung_task_panic" },
+ {"numa_zonelist_order", "vm.numa_zonelist_order" },
+ {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
+ { }
+};
+
+static const char *sysctl_find_alias(char *param)
+{
+ const struct sysctl_alias *alias;
+
+ for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
+ if (strcmp(alias->kernel_param, param) == 0)
+ return alias->sysctl_param;
+ }
+
+ return NULL;
+}
+
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
+/* Set sysctl value passed on kernel command line. */
+static int process_sysctl_arg(char *param, char *val,
+ const char *unused, void *arg)
+{
+ char *path;
+ struct vfsmount **proc_mnt = arg;
+ struct file_system_type *proc_fs_type;
+ struct file *file;
+ int len;
+ int err;
+ loff_t pos = 0;
+ ssize_t wret;
+
+ if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
+ param += sizeof("sysctl") - 1;
+
+ if (param[0] != '/' && param[0] != '.')
+ return 0;
+
+ param++;
+ } else {
+ param = (char *) sysctl_find_alias(param);
+ if (!param)
+ return 0;
+ }
+
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
+ /*
+ * To set sysctl options, we use a temporary mount of proc, look up the
+ * respective sys/ file and write to it. To avoid mounting it when no
+ * options were given, we mount it only when the first sysctl option is
+ * found. Why not a persistent mount? There are problems with a
+ * persistent mount of proc in that it forces userspace not to use any
+ * proc mount options.
+ */
+ if (!*proc_mnt) {
+ proc_fs_type = get_fs_type("proc");
+ if (!proc_fs_type) {
+ pr_err("Failed to find procfs to set sysctl from command line\n");
+ return 0;
+ }
+ *proc_mnt = kern_mount(proc_fs_type);
+ put_filesystem(proc_fs_type);
+ if (IS_ERR(*proc_mnt)) {
+ pr_err("Failed to mount procfs to set sysctl from command line\n");
+ return 0;
+ }
+ }
+
+ path = kasprintf(GFP_KERNEL, "sys/%s", param);
+ if (!path)
+ panic("%s: Failed to allocate path for %s\n", __func__, param);
+ strreplace(path, '.', '/');
+
+ file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ if (err == -ENOENT)
+ pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
+ param, val);
+ else if (err == -EACCES)
+ pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
+ param, val);
+ else
+ pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
+ file, param, val);
+ goto out;
+ }
+ wret = kernel_write(file, val, len, &pos);
+ if (wret < 0) {
+ err = wret;
+ if (err == -EINVAL)
+ pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
+ param, val);
+ else
+ pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
+ ERR_PTR(err), param, val);
+ } else if (wret != len) {
+ pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
+ wret, len, path, param, val);
+ }
+
+ err = filp_close(file, NULL);
+ if (err)
+ pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
+ ERR_PTR(err), param, val);
+out:
+ kfree(path);
+ return 0;
+}
+
+void do_sysctl_args(void)
+{
+ char *command_line;
+ struct vfsmount *proc_mnt = NULL;
+
+ command_line = kstrdup(saved_command_line, GFP_KERNEL);
+ if (!command_line)
+ panic("%s: Failed to allocate copy of command line\n", __func__);
+
+ parse_args("Setting sysctl args", command_line,
+ NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
+
+ if (proc_mnt)
+ kern_unmount(proc_mnt);
+
+ kfree(command_line);
}
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 901bd06f437d..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -1,10 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* proc_tty.c -- handles /proc/tty
*
* Copyright 1997, Theodore Ts'o
*/
-
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -14,6 +13,7 @@
#include <linux/tty.h>
#include <linux/seq_file.h>
#include <linux/bitops.h>
+#include "internal.h"
/*
* The /proc/tty directory inodes...
@@ -124,18 +124,6 @@ static const struct seq_operations tty_drivers_op = {
.show = show_tty_driver
};
-static int tty_drivers_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &tty_drivers_op);
-}
-
-static const struct file_operations proc_tty_drivers_operations = {
- .open = tty_drivers_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
/*
* This function is called by tty_register_driver() to handle
* registering the driver's /proc handler into /proc/tty/driver/<foo>
@@ -145,11 +133,11 @@ void proc_tty_register_driver(struct tty_driver *driver)
struct proc_dir_entry *ent;
if (!driver->driver_name || driver->proc_entry ||
- !driver->ops->proc_fops)
+ !driver->ops->proc_show)
return;
- ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
- driver->ops->proc_fops, driver);
+ ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver,
+ driver->ops->proc_show, driver);
driver->proc_entry = ent;
}
@@ -164,7 +152,7 @@ void proc_tty_unregister_driver(struct tty_driver *driver)
if (!ent)
return;
- remove_proc_entry(driver->driver_name, proc_tty_driver);
+ remove_proc_entry(ent->name, proc_tty_driver);
driver->proc_entry = NULL;
}
@@ -184,6 +172,6 @@ void __init proc_tty_init(void)
* entry.
*/
proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
- proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
- proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
+ proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops);
+ proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op);
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index deecb397daa3..d8ca41d823e4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/proc/root.c
*
@@ -5,9 +6,6 @@
*
* proc root directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -18,127 +16,361 @@
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
#include "internal.h"
-enum {
- Opt_gid, Opt_hidepid, Opt_err,
+struct proc_fs_context {
+ struct pid_namespace *pid_ns;
+ unsigned int mask;
+ enum proc_hidepid hidepid;
+ int gid;
+ enum proc_pidonly pidonly;
};
-static const match_table_t tokens = {
- {Opt_hidepid, "hidepid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_err, NULL},
+enum proc_param {
+ Opt_gid,
+ Opt_hidepid,
+ Opt_subset,
+ Opt_pidns,
};
-int proc_parse_options(char *options, struct pid_namespace *pid)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- pid->pid_gid = make_kgid(current_user_ns(), option);
- break;
- case Opt_hidepid:
- if (match_int(&args[0], &option))
- return 0;
- if (option < HIDEPID_OFF ||
- option > HIDEPID_INVISIBLE) {
- pr_err("proc: hidepid value must be between 0 and 2.\n");
- return 0;
+static const struct fs_parameter_spec proc_fs_parameters[] = {
+ fsparam_u32("gid", Opt_gid),
+ fsparam_string("hidepid", Opt_hidepid),
+ fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
+ {}
+};
+
+static inline int valid_hidepid(unsigned int value)
+{
+ return (value == HIDEPID_OFF ||
+ value == HIDEPID_NO_ACCESS ||
+ value == HIDEPID_INVISIBLE ||
+ value == HIDEPID_NOT_PTRACEABLE);
+}
+
+static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid);
+ struct fs_parse_result result;
+ int base = (unsigned long)hidepid_u32_spec.data;
+
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "proc: unexpected type of hidepid value\n");
+
+ if (!kstrtouint(param->string, base, &result.uint_32)) {
+ if (!valid_hidepid(result.uint_32))
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+ ctx->hidepid = result.uint_32;
+ return 0;
+ }
+
+ if (!strcmp(param->string, "off"))
+ ctx->hidepid = HIDEPID_OFF;
+ else if (!strcmp(param->string, "noaccess"))
+ ctx->hidepid = HIDEPID_NO_ACCESS;
+ else if (!strcmp(param->string, "invisible"))
+ ctx->hidepid = HIDEPID_INVISIBLE;
+ else if (!strcmp(param->string, "ptraceable"))
+ ctx->hidepid = HIDEPID_NOT_PTRACEABLE;
+ else
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+
+ return 0;
+}
+
+static int proc_parse_subset_param(struct fs_context *fc, char *value)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ while (value) {
+ char *ptr = strchr(value, ',');
+
+ if (ptr != NULL)
+ *ptr++ = '\0';
+
+ if (*value != '\0') {
+ if (!strcmp(value, "pid")) {
+ ctx->pidonly = PROC_PIDONLY_ON;
+ } else {
+ return invalf(fc, "proc: unsupported subset option - %s\n", value);
}
- pid->hide_pid = option;
- break;
- default:
- pr_err("proc: unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
}
+ value = ptr;
}
- return 1;
+ return 0;
}
-int proc_remount(struct super_block *sb, int *flags, char *data)
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
{
- struct pid_namespace *pid = sb->s_fs_info;
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
- sync_filesystem(sb);
- return !proc_parse_options(data, pid);
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
}
+#endif /* CONFIG_PID_NS */
-static struct dentry *proc_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct pid_namespace *ns;
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt, err;
+
+ opt = fs_parse(fc, proc_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_gid:
+ ctx->gid = result.uint_32;
+ break;
+
+ case Opt_hidepid:
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
+ break;
+
+ case Opt_subset:
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
+ break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
+
+ default:
+ return -EINVAL;
+ }
+
+ ctx->mask |= 1 << opt;
+ return 0;
+}
+
+static void proc_apply_options(struct proc_fs_info *fs_info,
+ struct fs_context *fc,
+ struct user_namespace *user_ns)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->mask & (1 << Opt_gid))
+ fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
+ if (ctx->mask & (1 << Opt_hidepid))
+ fs_info->hide_pid = ctx->hidepid;
+ if (ctx->mask & (1 << Opt_subset))
+ fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct inode *root_inode;
+ struct proc_fs_info *fs_info;
+ int ret;
+
+ fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ proc_apply_options(fs_info, fc, current_user_ns());
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = PROC_SUPER_MAGIC;
+ s->s_op = &proc_sops;
+ s->s_time_gran = 1;
+ s->s_fs_info = fs_info;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink->seeks = 0;
+
+ pde_get(&proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
- if (flags & MS_KERNMOUNT) {
- ns = data;
- data = NULL;
- } else {
- ns = task_active_pid_ns(current);
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
}
- return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
+
+ sync_filesystem(sb);
+
+ proc_apply_options(fs_info, fc, current_user_ns());
+ return 0;
+}
+
+static int proc_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, proc_fill_super);
+}
+
+static void proc_fs_context_free(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ put_pid_ns(ctx->pid_ns);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+ .free = proc_fs_context_free,
+ .parse_param = proc_parse_param,
+ .get_tree = proc_get_tree,
+ .reconfigure = proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ fc->fs_private = ctx;
+ fc->ops = &proc_fs_context_ops;
+ return 0;
}
static void proc_kill_sb(struct super_block *sb)
{
- struct pid_namespace *ns;
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
- ns = (struct pid_namespace *)sb->s_fs_info;
- if (ns->proc_self)
- dput(ns->proc_self);
- if (ns->proc_thread_self)
- dput(ns->proc_thread_self);
kill_anon_super(sb);
- put_pid_ns(ns);
+ if (fs_info) {
+ put_pid_ns(fs_info->pid_ns);
+ kfree_rcu(fs_info, rcu);
+ }
}
static struct file_system_type proc_fs_type = {
- .name = "proc",
- .mount = proc_mount,
- .kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "proc",
+ .init_fs_context = proc_init_fs_context,
+ .parameters = proc_fs_parameters,
+ .kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
};
void __init proc_root_init(void)
{
- int err;
-
- proc_init_inodecache();
+ proc_init_kmemcache();
set_proc_pid_nlink();
- err = register_filesystem(&proc_fs_type);
- if (err)
- return;
-
proc_self_init();
proc_thread_self_init();
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
-
-#ifdef CONFIG_SYSVIPC
- proc_mkdir("sysvipc", NULL);
-#endif
proc_mkdir("fs", NULL);
proc_mkdir("driver", NULL);
proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
@@ -149,21 +381,30 @@ void __init proc_root_init(void)
proc_tty_init();
proc_mkdir("bus", NULL);
proc_sys_init();
+
+ /*
+ * Last things last. It is not like userspace processes eager
+ * to open /proc files exist at this point but register last
+ * anyway.
+ */
+ register_filesystem(&proc_fs_type);
}
-static int proc_root_getattr(const struct path *path, struct kstat *stat,
+static int proc_root_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
- generic_fillattr(d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
-
+
return proc_lookup(dir, dentry, flags);
}
@@ -202,31 +443,14 @@ static const struct inode_operations proc_root_inode_operations = {
* This is the root "inode" in the /proc tree..
*/
struct proc_dir_entry proc_root = {
- .low_ino = PROC_ROOT_INO,
- .namelen = 5,
- .mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .nlink = 2,
- .count = ATOMIC_INIT(1),
- .proc_iops = &proc_root_inode_operations,
- .proc_fops = &proc_root_operations,
+ .low_ino = PROCFS_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
+ .refcnt = REFCOUNT_INIT(1),
+ .proc_iops = &proc_root_inode_operations,
+ .proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
.name = "/proc",
};
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
- struct vfsmount *mnt;
-
- mnt = kern_mount_data(&proc_fs_type, ns);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- ns->proc_mnt = mnt;
- return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
- kern_unmount(ns->proc_mnt);
-}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 39857f6db5cf..62d2c0cfe35c 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
@@ -10,17 +12,17 @@ static const char *proc_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = inode->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
if (!tgid)
return ERR_PTR(-ENOENT);
- /* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ /* max length of unsigned int in decimal + NULL term */
+ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
if (unlikely(!name))
return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
- sprintf(name, "%d", tgid);
+ sprintf(name, "%u", tgid);
set_delayed_call(done, kfree_link, name);
return name;
}
@@ -29,40 +31,36 @@ static const struct inode_operations proc_self_inode_operations = {
.get_link = proc_self_get_link,
};
-static unsigned self_inum;
+unsigned self_inum __ro_after_init;
int proc_setup_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = s->s_fs_info;
struct dentry *self;
-
+ int ret = -ENOMEM;
+
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
- d_add(self, inode);
- } else {
- dput(self);
- self = ERR_PTR(-ENOMEM);
+ d_make_persistent(self, inode);
+ ret = 0;
}
- } else {
- self = ERR_PTR(-ENOMEM);
+ dput(self);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index ad8a77f94beb..04bb29721419 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
/*
* /proc/softirqs ... display the number of softirqs
@@ -18,27 +20,18 @@ static int show_softirqs(struct seq_file *p, void *v)
for (i = 0; i < NR_SOFTIRQS; i++) {
seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
- seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+ seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10);
seq_putc(p, '\n');
}
return 0;
}
-static int softirqs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_softirqs, NULL);
-}
-
-static const struct file_operations proc_softirqs_operations = {
- .open = softirqs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_softirqs_init(void)
{
- proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bd4e55f4aa20..8b444e862319 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -9,6 +10,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/irqnr.h>
#include <linux/sched/cputime.h>
#include <linux/tick.h>
@@ -20,31 +22,7 @@
#define arch_irq_stat() 0
#endif
-#ifdef arch_idle_time
-
-static u64 get_idle_time(int cpu)
-{
- u64 idle;
-
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
- if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
- idle += arch_idle_time(cpu);
- return idle;
-}
-
-static u64 get_iowait_time(int cpu)
-{
- u64 iowait;
-
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
- if (cpu_online(cpu) && nr_iowait_cpu(cpu))
- iowait += arch_idle_time(cpu);
- return iowait;
-}
-
-#else
-
-static u64 get_idle_time(int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -53,14 +31,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -69,14 +47,37 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
return iowait;
}
-#endif
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, irq_get_nr_irqs() - next);
+}
static int show_stat(struct seq_file *p, void *v)
{
@@ -92,20 +93,27 @@ static int show_stat(struct seq_file *p, void *v)
irq = softirq = steal = 0;
guest = guest_nice = 0;
getboottime64(&boottime);
+ /* shift boot timestamp according to the timens offset */
+ timens_sub_boottime(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
- sum += kstat_cpu_irqs_sum(i);
- sum += arch_irq_stat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ nice += cpustat[CPUTIME_NICE];
+ system += cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(&kcpustat, i);
+ iowait += get_iowait_time(&kcpustat, i);
+ irq += cpustat[CPUTIME_IRQ];
+ softirq += cpustat[CPUTIME_SOFTIRQ];
+ steal += cpustat[CPUTIME_STEAL];
+ guest += cpustat[CPUTIME_GUEST];
+ guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -129,17 +137,22 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = cpustat[CPUTIME_USER];
+ nice = cpustat[CPUTIME_NICE];
+ system = cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(&kcpustat, i);
+ iowait = get_iowait_time(&kcpustat, i);
+ irq = cpustat[CPUTIME_IRQ];
+ softirq = cpustat[CPUTIME_SOFTIRQ];
+ steal = cpustat[CPUTIME_STEAL];
+ guest = cpustat[CPUTIME_GUEST];
+ guest_nice = cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -155,16 +168,14 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
- /* sum again ? it could be updated? */
- for_each_irq_nr(j)
- seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+ show_all_irqs(p);
seq_printf(p,
"\nctxt %llu\n"
"btime %llu\n"
"processes %lu\n"
- "procs_running %lu\n"
- "procs_blocked %lu\n",
+ "procs_running %u\n"
+ "procs_blocked %u\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
@@ -182,23 +193,24 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- size_t size = 1024 + 128 * num_online_cpus();
+ unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
- size += 2 * nr_irqs;
+ size += 2 * irq_get_nr_irqs();
return single_open_size(file, show_stat, NULL, size);
}
-static const struct file_operations proc_stat_operations = {
- .open = stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = stat_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init proc_stat_init(void)
{
- proc_create("stat", 0, NULL, &proc_stat_operations);
+ proc_create("stat", 0, NULL, &stat_proc_ops);
return 0;
}
fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b836fd61ed87..81dfc26bfae8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,8 +1,10 @@
-#include <linux/mm.h>
-#include <linux/vmacache.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
+#include <linux/ksm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
@@ -12,24 +14,34 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
+#include <linux/uaccess.h>
+#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
+#include <linux/buildid.h>
#include <asm/elf.h>
-#include <linux/uaccess.h>
+#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+ unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -45,44 +57,34 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
- text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
- lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
- ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
- pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
- seq_printf(m,
- "VmPeak:\t%8lu kB\n"
- "VmSize:\t%8lu kB\n"
- "VmLck:\t%8lu kB\n"
- "VmPin:\t%8lu kB\n"
- "VmHWM:\t%8lu kB\n"
- "VmRSS:\t%8lu kB\n"
- "RssAnon:\t%8lu kB\n"
- "RssFile:\t%8lu kB\n"
- "RssShmem:\t%8lu kB\n"
- "VmData:\t%8lu kB\n"
- "VmStk:\t%8lu kB\n"
- "VmExe:\t%8lu kB\n"
- "VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n"
- "VmPMD:\t%8lu kB\n"
- "VmSwap:\t%8lu kB\n",
- hiwater_vm << (PAGE_SHIFT-10),
- total_vm << (PAGE_SHIFT-10),
- mm->locked_vm << (PAGE_SHIFT-10),
- mm->pinned_vm << (PAGE_SHIFT-10),
- hiwater_rss << (PAGE_SHIFT-10),
- total_rss << (PAGE_SHIFT-10),
- anon << (PAGE_SHIFT-10),
- file << (PAGE_SHIFT-10),
- shmem << (PAGE_SHIFT-10),
- mm->data_vm << (PAGE_SHIFT-10),
- mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- ptes >> 10,
- pmds >> 10,
- swap << (PAGE_SHIFT-10));
+ /* split executable areas between text and lib */
+ text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
+ text = min(text, mm->exec_vm << PAGE_SHIFT);
+ lib = (mm->exec_vm << PAGE_SHIFT) - text;
+
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
+ SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
+ SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
+ SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
+ SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
+ SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
+ SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
+ SEQ_PUT_DEC(" kB\nRssFile:\t", file);
+ SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
+ SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
+ SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmExe:\t", text >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmLib:\t", lib >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
+ SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
+ seq_puts(m, " kB\n");
hugetlb_report_usage(m, mm);
}
+#undef SEQ_PUT_DEC
unsigned long task_vsize(struct mm_struct *mm)
{
@@ -93,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -128,100 +130,214 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static void vma_stop(struct proc_maps_private *priv)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
{
- struct mm_struct *mm = priv->mm;
+ lock_ctx->locked_vma = NULL;
+ lock_ctx->mmap_locked = false;
+}
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
+static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->locked_vma) {
+ vma_end_read(lock_ctx->locked_vma);
+ lock_ctx->locked_vma = NULL;
+ }
}
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
{
- if (vma == priv->tail_vma)
- return NULL;
- return vma->vm_next ?: priv->tail_vma;
+ /*
+ * smaps and numa_maps perform page table walk, therefore require
+ * mmap_lock but maps can be read with locking just the vma and
+ * walking the vma tree under rcu read protection.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(lock_ctx->mm))
+ return false;
+
+ lock_ctx->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ reset_lock_ctx(lock_ctx);
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_unlock();
+ }
}
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
{
- if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+ struct vm_area_struct *vma;
+
+ if (lock_ctx->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_ctx_vma(lock_ctx);
+ vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ lock_ctx->locked_vma = vma;
+
+ return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+ if (lock_ctx->mmap_locked)
+ return false;
+
+ rcu_read_unlock();
+ mmap_read_lock(lock_ctx->mm);
+ /* Reinitialize the iterator after taking mmap_lock */
+ vma_iter_set(&priv->iter, pos);
+ lock_ctx->mmap_locked = true;
+
+ return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+
+retry:
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR of EAGAIN is possible */
+ if (IS_ERR(vma)) {
+ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+ goto retry;
+
+ return vma;
+ }
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
+ if (vma) {
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
+ } else {
+ *ppos = SENTINEL_VMA_GATE;
+ vma = get_gate_vma(priv->lock_ctx.mm);
+ }
+
+ return vma;
}
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
+ struct proc_maps_locking_ctx *lock_ctx;
+ loff_t last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
- unsigned int pos = *ppos;
- /* See m_cache_vma(). Zero at the start or after lseek. */
- if (last_addr == -1UL)
+ /* See m_next(). Zero at the start or after lseek. */
+ if (last_addr == SENTINEL_VMA_END)
return NULL;
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ lock_ctx = &priv->lock_ctx;
+ mm = lock_ctx->mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
-
- down_read(&mm->mmap_sem);
- hold_task_mempolicy(priv);
- priv->tail_vma = get_gate_vma(mm);
-
- if (last_addr) {
- vma = find_vma(mm, last_addr - 1);
- if (vma && vma->vm_start <= last_addr)
- vma = m_next_vma(priv, vma);
- if (vma)
- return vma;
}
- m->version = 0;
- if (pos < mm->map_count) {
- for (vma = mm->mmap; pos; pos--) {
- m->version = vma->vm_start;
- vma = vma->vm_next;
- }
- return vma;
+ if (!lock_vma_range(m, lock_ctx)) {
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return ERR_PTR(-EINTR);
}
- /* we do not bother to update m->version in this case */
- if (pos == mm->map_count && priv->tail_vma)
- return priv->tail_vma;
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
+ vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
+ hold_task_mempolicy(priv);
+ if (last_addr == SENTINEL_VMA_GATE)
+ return get_gate_vma(mm);
- vma_stop(priv);
- return NULL;
+ return proc_get_vma(m, ppos);
}
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next;
-
- (*pos)++;
- next = m_next_vma(priv, v);
- if (!next)
- vma_stop(priv);
- return next;
+ if (*ppos == SENTINEL_VMA_GATE) {
+ *ppos = SENTINEL_VMA_END;
+ return NULL;
+ }
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->lock_ctx.mm;
- if (!IS_ERR_OR_NULL(v))
- vma_stop(priv);
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ unlock_vma_range(&priv->lock_ctx);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
static int proc_maps_open(struct inode *inode, struct file *file,
@@ -233,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->lock_ctx.mm)) {
+ int err = PTR_ERR(priv->lock_ctx.mm);
seq_release_private(inode, file);
return err;
@@ -249,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
@@ -262,37 +378,101 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
+static void get_vma_name(struct vm_area_struct *vma,
+ const struct path **path,
+ const char **name,
+ const char **name_fmt)
{
+ struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
+
+ *name = NULL;
+ *path = NULL;
+ *name_fmt = NULL;
+
/*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
*/
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
+ if (vma->vm_file) {
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name) {
+ *name_fmt = "[anon_shmem:%s]";
+ *name = anon_name->name;
+ } else {
+ *path = file_user_path(vma->vm_file);
+ }
+ return;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ *name = vma->vm_ops->name(vma);
+ if (*name)
+ return;
+ }
+
+ *name = arch_vma_name(vma);
+ if (*name)
+ return;
+
+ if (!vma->vm_mm) {
+ *name = "[vdso]";
+ return;
+ }
+
+ if (vma_is_initial_heap(vma)) {
+ *name = "[heap]";
+ return;
+ }
+
+ if (vma_is_initial_stack(vma)) {
+ *name = "[stack]";
+ return;
+ }
+
+ if (anon_name) {
+ *name_fmt = "[anon:%s]";
+ *name = anon_name->name;
+ return;
+ }
+}
+
+static void show_vma_header_prefix(struct seq_file *m,
+ unsigned long start, unsigned long end,
+ vm_flags_t flags, unsigned long long pgoff,
+ dev_t dev, unsigned long ino)
+{
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_put_hex_ll(m, NULL, start, 8);
+ seq_put_hex_ll(m, "-", end, 8);
+ seq_putc(m, ' ');
+ seq_putc(m, flags & VM_READ ? 'r' : '-');
+ seq_putc(m, flags & VM_WRITE ? 'w' : '-');
+ seq_putc(m, flags & VM_EXEC ? 'x' : '-');
+ seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
+ seq_put_hex_ll(m, " ", pgoff, 8);
+ seq_put_hex_ll(m, " ", MAJOR(dev), 2);
+ seq_put_hex_ll(m, ":", MINOR(dev), 2);
+ seq_put_decimal_ull(m, " ", ino);
+ seq_putc(m, ' ');
}
static void
-show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
- struct file *file = vma->vm_file;
- struct proc_maps_private *priv = m->private;
+ const struct path *path;
+ const char *name_fmt, *name;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- const char *name = NULL;
- if (file) {
- struct inode *inode = file_inode(vma->vm_file);
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -300,98 +480,340 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
start = vma->vm_start;
end = vma->vm_end;
+ show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
- start,
- end,
- flags & VM_READ ? 'r' : '-',
- flags & VM_WRITE ? 'w' : '-',
- flags & VM_EXEC ? 'x' : '-',
- flags & VM_MAYSHARE ? 's' : 'p',
- pgoff,
- MAJOR(dev), MINOR(dev), ino);
-
- /*
- * Print the dentry name for named mappings, and a
- * special [heap] marker for the heap:
- */
- if (file) {
+ get_vma_name(vma, &path, &name, &name_fmt);
+ if (path) {
+ seq_pad(m, ' ');
+ seq_path(m, path, "\n");
+ } else if (name_fmt) {
seq_pad(m, ' ');
- seq_file_path(m, file, "\n");
- goto done;
+ seq_printf(m, name_fmt, name);
+ } else if (name) {
+ seq_pad(m, ' ');
+ seq_puts(m, name);
}
+ seq_putc(m, '\n');
+}
- if (vma->vm_ops && vma->vm_ops->name) {
- name = vma->vm_ops->name(vma);
- if (name)
- goto done;
+static int show_map(struct seq_file *m, void *v)
+{
+ show_map_vma(m, v);
+ return 0;
+}
+
+static const struct seq_operations proc_pid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+#define PROCMAP_QUERY_VMA_FLAGS ( \
+ PROCMAP_QUERY_VMA_READABLE | \
+ PROCMAP_QUERY_VMA_WRITABLE | \
+ PROCMAP_QUERY_VMA_EXECUTABLE | \
+ PROCMAP_QUERY_VMA_SHARED \
+)
+
+#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
+ PROCMAP_QUERY_FILE_BACKED_VMA | \
+ PROCMAP_QUERY_VMA_FLAGS \
+)
+
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ reset_lock_ctx(lock_ctx);
+
+ return 0;
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+ } else {
+ unlock_ctx_vma(lock_ctx);
}
+}
- name = arch_vma_name(vma);
- if (!name) {
- if (!mm) {
- name = "[vdso]";
- goto done;
- }
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ struct mm_struct *mm = lock_ctx->mm;
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
- name = "[heap]";
- goto done;
- }
+ if (lock_ctx->mmap_locked)
+ return find_vma(mm, addr);
- if (is_stack(priv, vma))
- name = "[stack]";
+ /* Unlock previously locked VMA and find the next one under RCU */
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_lock();
+ vma_iter_init(&vmi, mm, addr);
+ vma = lock_next_vma(mm, &vmi, addr);
+ rcu_read_unlock();
+
+ if (!vma)
+ return NULL;
+
+ if (!IS_ERR(vma)) {
+ lock_ctx->locked_vma = vma;
+ return vma;
}
-done:
- if (name) {
- seq_pad(m, ' ');
- seq_puts(m, name);
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* Fallback to mmap_lock on vma->vm_refcnt overflow */
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ lock_ctx->mmap_locked = true;
}
- seq_putc(m, '\n');
+
+ return vma;
}
-static int show_map(struct seq_file *m, void *v, int is_pid)
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
- show_map_vma(m, v, is_pid);
- m_cache_vma(m, v);
- return 0;
+ return mmap_read_lock_killable(lock_ctx->mm);
}
-static int show_pid_map(struct seq_file *m, void *v)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
- return show_map(m, v, 1);
+ mmap_read_unlock(lock_ctx->mm);
}
-static int show_tid_map(struct seq_file *m, void *v)
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
{
- return show_map(m, v, 0);
+ return find_vma(lock_ctx->mm, addr);
}
-static const struct seq_operations proc_pid_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_pid_map
-};
+#endif /* CONFIG_PER_VMA_LOCK */
-static const struct seq_operations proc_tid_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_map
-};
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr, u32 flags)
+{
+ struct vm_area_struct *vma;
-static int pid_maps_open(struct inode *inode, struct file *file)
+next_vma:
+ vma = query_vma_find_by_addr(lock_ctx, addr);
+ if (IS_ERR(vma))
+ return vma;
+
+ if (!vma)
+ goto no_vma;
+
+ /* user requested only file-backed VMA, keep iterating */
+ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
+ goto skip_vma;
+
+ /* VMA permissions should satisfy query flags */
+ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
+ u32 perm = 0;
+
+ if (flags & PROCMAP_QUERY_VMA_READABLE)
+ perm |= VM_READ;
+ if (flags & PROCMAP_QUERY_VMA_WRITABLE)
+ perm |= VM_WRITE;
+ if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
+ perm |= VM_EXEC;
+ if (flags & PROCMAP_QUERY_VMA_SHARED)
+ perm |= VM_MAYSHARE;
+
+ if ((vma->vm_flags & perm) != perm)
+ goto skip_vma;
+ }
+
+ /* found covering VMA or user is OK with the matching next VMA */
+ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
+ return vma;
+
+skip_vma:
+ /*
+ * If the user needs closest matching VMA, keep iterating.
+ */
+ addr = vma->vm_end;
+ if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
+ goto next_vma;
+
+no_vma:
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
{
- return do_maps_open(inode, file, &proc_pid_maps_op);
+ struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
+ struct procmap_query karg;
+ struct vm_area_struct *vma;
+ const char *name = NULL;
+ char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
+ __u64 usize;
+ int err;
+
+ if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
+ return -EFAULT;
+ /* argument struct can never be that large, reject abuse */
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+ /* argument struct should have at least query_flags and query_addr fields */
+ if (usize < offsetofend(struct procmap_query, query_addr))
+ return -EINVAL;
+ err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ if (err)
+ return err;
+
+ /* reject unknown flags */
+ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
+ return -EINVAL;
+ /* either both buffer address and size are set, or both should be zero */
+ if (!!karg.vma_name_size != !!karg.vma_name_addr)
+ return -EINVAL;
+ if (!!karg.build_id_size != !!karg.build_id_addr)
+ return -EINVAL;
+
+ if (!mm || !mmget_not_zero(mm))
+ return -ESRCH;
+
+ err = query_vma_setup(&lock_ctx);
+ if (err) {
+ mmput(mm);
+ return err;
+ }
+
+ vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ vma = NULL;
+ goto out;
+ }
+
+ karg.vma_start = vma->vm_start;
+ karg.vma_end = vma->vm_end;
+
+ karg.vma_flags = 0;
+ if (vma->vm_flags & VM_READ)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
+ if (vma->vm_flags & VM_WRITE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
+ if (vma->vm_flags & VM_EXEC)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
+ if (vma->vm_flags & VM_MAYSHARE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
+
+ karg.vma_page_size = vma_kernel_pagesize(vma);
+
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
+ karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
+ karg.dev_major = MAJOR(inode->i_sb->s_dev);
+ karg.dev_minor = MINOR(inode->i_sb->s_dev);
+ karg.inode = inode->i_ino;
+ } else {
+ karg.vma_offset = 0;
+ karg.dev_major = 0;
+ karg.dev_minor = 0;
+ karg.inode = 0;
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.vma_name_size) {
+ size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
+ const struct path *path;
+ const char *name_fmt;
+ size_t name_sz = 0;
+
+ get_vma_name(vma, &path, &name, &name_fmt);
+
+ if (path || name_fmt || name) {
+ name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
+ if (!name_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (path) {
+ name = d_path(path, name_buf, name_buf_sz);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ goto out;
+ }
+ name_sz = name_buf + name_buf_sz - name;
+ } else if (name || name_fmt) {
+ name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
+ name = name_buf;
+ }
+ if (name_sz > name_buf_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.vma_name_size = name_sz;
+ }
+
+ /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+
+ if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
+ name, karg.vma_name_size)) {
+ kfree(name_buf);
+ return -EFAULT;
+ }
+ kfree(name_buf);
+
+ if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
+ build_id_buf, karg.build_id_size))
+ return -EFAULT;
+
+ if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
+ return -EFAULT;
+
+ return 0;
+
+out:
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+ kfree(name_buf);
+ return err;
}
-static int tid_maps_open(struct inode *inode, struct file *file)
+static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- return do_maps_open(inode, file, &proc_tid_maps_op);
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ switch (cmd) {
+ case PROCMAP_QUERY:
+ /* priv->lock_ctx.mm is set during file open operation */
+ return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
}
const struct file_operations proc_pid_maps_operations = {
@@ -399,13 +821,8 @@ const struct file_operations proc_pid_maps_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = proc_map_release,
-};
-
-const struct file_operations proc_tid_maps_operations = {
- .open = tid_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_map_release,
+ .unlocked_ioctl = procfs_procmap_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -439,94 +856,177 @@ struct mem_size_stats {
unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
+ unsigned long file_thp;
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long ksm;
u64 pss;
+ u64 pss_anon;
+ u64 pss_file;
+ u64 pss_shmem;
+ u64 pss_dirty;
+ u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
+static void smaps_page_accumulate(struct mem_size_stats *mss,
+ struct folio *folio, unsigned long size, unsigned long pss,
+ bool dirty, bool locked, bool private)
+{
+ mss->pss += pss;
+
+ if (folio_test_anon(folio))
+ mss->pss_anon += pss;
+ else if (folio_test_swapbacked(folio))
+ mss->pss_shmem += pss;
+ else
+ mss->pss_file += pss;
+
+ if (locked)
+ mss->pss_locked += pss;
+
+ if (dirty || folio_test_dirty(folio)) {
+ mss->pss_dirty += pss;
+ if (private)
+ mss->private_dirty += size;
+ else
+ mss->shared_dirty += size;
+ } else {
+ if (private)
+ mss->private_clean += size;
+ else
+ mss->shared_clean += size;
+ }
+}
+
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty)
+ bool compound, bool young, bool dirty, bool locked,
+ bool present)
{
- int i, nr = compound ? 1 << compound_order(page) : 1;
+ struct folio *folio = page_folio(page);
+ int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
+ bool exclusive;
+ int mapcount;
- if (PageAnon(page)) {
+ /*
+ * First accumulate quantities that depend only on |size| and the type
+ * of the compound page.
+ */
+ if (folio_test_anon(folio)) {
mss->anonymous += size;
- if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ if (!folio_test_swapbacked(folio) && !dirty &&
+ !folio_test_dirty(folio))
mss->lazyfree += size;
}
+ if (folio_test_ksm(folio))
+ mss->ksm += size;
+
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || page_is_young(page) || PageReferenced(page))
+ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/*
- * page_count(page) == 1 guarantees the page is mapped exactly once.
- * If any subpage of the compound page mapped with PTE it would elevate
- * page_count().
+ * Then accumulate quantities that may depend on sharing, or that may
+ * differ page-by-page.
+ *
+ * refcount == 1 for present entries guarantees that the folio is mapped
+ * exactly once. For large folios this implies that exactly one
+ * PTE/PMD/... maps (a part of) this folio.
+ *
+ * Treat all non-present entries (where relying on the mapcount and
+ * refcount doesn't make sense) as "maybe shared, but not sure how
+ * often". We treat device private entries as being fake-present.
+ *
+ * Note that it would not be safe to read the mapcount especially for
+ * pages referenced by migration entries, even with the PTL held.
*/
- if (page_count(page) == 1) {
- if (dirty || PageDirty(page))
- mss->private_dirty += size;
- else
- mss->private_clean += size;
- mss->pss += (u64)size << PSS_SHIFT;
+ if (folio_ref_count(folio) == 1 || !present) {
+ smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
+ dirty, locked, present);
return;
}
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ mapcount = folio_average_page_mapcount(folio);
+ exclusive = !folio_maybe_mapped_shared(folio);
+ }
+
+ /*
+ * We obtain a snapshot of the mapcount. Without holding the folio lock
+ * this snapshot can be slightly wrong as we cannot always read the
+ * mapcount atomically.
+ */
for (i = 0; i < nr; i++, page++) {
- int mapcount = page_mapcount(page);
+ unsigned long pss = PAGE_SIZE << PSS_SHIFT;
- if (mapcount >= 2) {
- if (dirty || PageDirty(page))
- mss->shared_dirty += PAGE_SIZE;
- else
- mss->shared_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
- } else {
- if (dirty || PageDirty(page))
- mss->private_dirty += PAGE_SIZE;
- else
- mss->private_clean += PAGE_SIZE;
- mss->pss += PAGE_SIZE << PSS_SHIFT;
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+ mapcount = folio_precise_page_mapcount(folio, page);
+ exclusive = mapcount < 2;
}
+
+ if (mapcount >= 2)
+ pss /= mapcount;
+ smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
+ dirty, locked, exclusive);
}
}
#ifdef CONFIG_SHMEM
static int smaps_pte_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
+#else
+#define smaps_pte_hole NULL
+#endif /* CONFIG_SHMEM */
+
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
#endif
+}
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool present = false, young = false, dirty = false;
+ pte_t ptent = ptep_get(pte);
- if (pte_present(*pte)) {
- page = vm_normal_page(vma, addr, *pte);
- } else if (is_swap_pte(*pte)) {
- swp_entry_t swpent = pte_to_swp_entry(*pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(vma, addr, ptent);
+ young = pte_young(ptent);
+ dirty = pte_dirty(ptent);
+ present = true;
+ } else if (pte_none(ptent)) {
+ smaps_pte_hole_lookup(addr, walk);
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (!non_swap_entry(swpent)) {
+ if (softleaf_is_swap(entry)) {
int mapcount;
mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(swpent);
+ mapcount = swp_swapcount(entry);
if (mapcount >= 2) {
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
@@ -535,27 +1035,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
- if (!page)
- return;
-
- if (radix_tree_exceptional_entry(page))
- mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
- return;
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_device_private(entry))
+ present = true;
+ page = softleaf_to_page(entry);
+ }
}
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+ smaps_account(mss, page, false, young, dirty, locked, present);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -564,21 +1054,36 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
+ struct page *page = NULL;
+ bool present = false;
+ struct folio *folio;
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (pmd_none(*pmd))
+ return;
+ if (pmd_present(*pmd)) {
+ page = vm_normal_page_pmd(vma, addr, *pmd);
+ present = true;
+ } else if (unlikely(thp_migration_supported())) {
+ const softleaf_t entry = softleaf_from_pmd(*pmd);
+
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
+ }
if (IS_ERR_OR_NULL(page))
return;
- if (PageAnon(page))
+ folio = page_folio(page);
+ if (folio_test_anon(folio))
mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (is_zone_device_page(page))
+ else if (folio_is_zone_device(folio))
/* pass */;
else
- VM_BUG_ON_PAGE(1, page);
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+ mss->file_thp += HPAGE_PMD_SIZE;
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -598,20 +1103,18 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (ptl) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
- return 0;
+ goto out;
}
- if (pmd_trans_unstable(pmd))
- return 0;
- /*
- * The mmap_sem held all the way back in m_start() is what
- * keeps khugepaged out of here and from collapsing things
- * in here.
- */
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE)
smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
+out:
cond_resched();
return 0;
}
@@ -620,8 +1123,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
* Don't forget to update Documentation/ on changes.
+ *
+ * The length of the second argument of mnemonics[]
+ * needs to be 3 instead of previously set 2
+ * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
+ * to avoid spurious
+ * -Werror=unterminated-string-initialization warning
+ * with GCC 15
*/
- static const char mnemonics[BITS_PER_LONG][2] = {
+ static const char mnemonics[BITS_PER_LONG][3] = {
/*
* In case if we meet a flag we don't know about.
*/
@@ -637,21 +1147,24 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
- [ilog2(VM_DENYWRITE)] = "dw",
-#ifdef CONFIG_X86_INTEL_MPX
- [ilog2(VM_MPX)] = "mp",
-#endif
+ [ilog2(VM_MAYBE_GUARD)] = "gu",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
+ [ilog2(VM_SYNC)] = "sf",
[ilog2(VM_ARCH_1)] = "ar",
+ [ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_ARM64_BTI
+ [ilog2(VM_ARM64_BTI)] = "bt",
+#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
[ilog2(VM_SOFTDIRTY)] = "sd",
#endif
@@ -661,13 +1174,34 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
+#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
+#if CONFIG_ARCH_PKEY_BITS > 3
[ilog2(VM_PKEY_BIT3)] = "",
#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
+ [ilog2(VM_PKEY_BIT4)] = "",
+#endif
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ [ilog2(VM_UFFD_MINOR)] = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+ [ilog2(VM_SHADOW_STACK)] = "ss",
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+ [ilog2(VM_DROPPABLE)] = "dp",
+#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
@@ -675,10 +1209,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
- seq_printf(m, "%c%c ",
- mnemonics[i][0], mnemonics[i][1]);
- }
+ if (vma->vm_flags & (1UL << i))
+ seq_printf(m, "%s ", mnemonics[i]);
}
seq_putc(m, '\n');
}
@@ -690,48 +1222,66 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page = NULL;
+ struct folio *folio = NULL;
+ bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
- if (pte_present(*pte)) {
- page = vm_normal_page(vma, addr, *pte);
- } else if (is_swap_pte(*pte)) {
- swp_entry_t swpent = pte_to_swp_entry(*pte);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
+ if (pte_present(ptent)) {
+ folio = page_folio(pte_page(ptent));
+ present = true;
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
+ if (softleaf_has_pfn(entry))
+ folio = softleaf_to_folio(entry);
}
- if (page) {
- int mapcount = page_mapcount(page);
- if (mapcount >= 2)
+ if (folio) {
+ /* We treat non-present entries as "maybe shared". */
+ if (!present || folio_maybe_mapped_shared(folio) ||
+ hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
+#else
+#define smaps_hugetlb_range NULL
#endif /* HUGETLB_PAGE */
-void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-}
+static const struct mm_walk_ops smaps_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
+};
-static int show_smap(struct seq_file *m, void *v, int is_pid)
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
+static void smap_gather_stats(struct vm_area_struct *vma,
+ struct mem_size_stats *mss, unsigned long start)
{
- struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
- struct mm_walk smaps_walk = {
- .pmd_entry = smaps_pte_range,
-#ifdef CONFIG_HUGETLB_PAGE
- .hugetlb_entry = smaps_hugetlb_range,
-#endif
- .mm = vma->vm_mm,
- .private = &mss,
- };
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
- memset(&mss, 0, sizeof mss);
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
-#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -745,90 +1295,220 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
- mss.swap = shmem_swapped;
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
+ mss->swap += shmem_swapped;
} else {
- mss.check_shmem_swap = true;
- smaps_walk.pte_hole = smaps_pte_hole;
+ ops = &smaps_shmem_walk_ops;
}
}
-#endif
- /* mmap_sem is held in m_start */
- walk_page_vma(vma, &smaps_walk);
-
- show_map_vma(m, vma, is_pid);
-
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "LazyFree: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
- "Shared_Hugetlb: %8lu kB\n"
- "Private_Hugetlb: %7lu kB\n"
- "Swap: %8lu kB\n"
- "SwapPss: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss.resident >> 10,
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
- mss.shared_clean >> 10,
- mss.shared_dirty >> 10,
- mss.private_clean >> 10,
- mss.private_dirty >> 10,
- mss.referenced >> 10,
- mss.anonymous >> 10,
- mss.lazyfree >> 10,
- mss.anonymous_thp >> 10,
- mss.shmem_thp >> 10,
- mss.shared_hugetlb >> 10,
- mss.private_hugetlb >> 10,
- mss.swap >> 10,
- (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
- vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10,
- (vma->vm_flags & VM_LOCKED) ?
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
-
- arch_show_smap(m, vma);
- show_smap_vma_flags(m, vma);
- m_cache_vma(m, vma);
- return 0;
+ /* mmap_lock is held in m_start */
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
-static int show_pid_smap(struct seq_file *m, void *v)
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
+
+/* Show the contents common for smaps and smaps_rollup */
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+ bool rollup_mode)
{
- return show_smap(m, v, 1);
+ SEQ_PUT_DEC("Rss: ", mss->resident);
+ SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
+ if (rollup_mode) {
+ /*
+ * These are meaningful only for smaps_rollup, otherwise two of
+ * them are zero, and the other one is the same as Pss.
+ */
+ SEQ_PUT_DEC(" kB\nPss_Anon: ",
+ mss->pss_anon >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_File: ",
+ mss->pss_file >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Shmem: ",
+ mss->pss_shmem >> PSS_SHIFT);
+ }
+ SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
+ SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
+ SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
+ SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
+ SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
+ SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
+ SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
+ SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
+ SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
+ seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
+ mss->private_hugetlb >> 10, 7);
+ SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
+ SEQ_PUT_DEC(" kB\nSwapPss: ",
+ mss->swap_pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nLocked: ",
+ mss->pss_locked >> PSS_SHIFT);
+ seq_puts(m, " kB\n");
+}
+
+static int show_smap(struct seq_file *m, void *v)
+{
+ struct vm_area_struct *vma = v;
+ struct mem_size_stats mss = {};
+
+ smap_gather_stats(vma, &mss, 0);
+
+ show_map_vma(m, vma);
+
+ SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
+ SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
+ SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
+ seq_puts(m, " kB\n");
+
+ __show_smap(m, &mss, false);
+
+ seq_printf(m, "THPeligible: %8u\n",
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
+ THP_ORDERS_ALL));
+
+ if (arch_pkeys_enabled())
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
+ show_smap_vma_flags(m, vma);
+
+ return 0;
}
-static int show_tid_smap(struct seq_file *m, void *v)
+static int show_smaps_rollup(struct seq_file *m, void *v)
{
- return show_smap(m, v, 0);
+ struct proc_maps_private *priv = m->private;
+ struct mem_size_stats mss = {};
+ struct mm_struct *mm = priv->lock_ctx.mm;
+ struct vm_area_struct *vma;
+ unsigned long vma_start = 0, last_vma_end = 0;
+ int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ priv->task = get_proc_task(priv->inode);
+ if (!priv->task)
+ return -ESRCH;
+
+ if (!mm || !mmget_not_zero(mm)) {
+ ret = -ESRCH;
+ goto out_put_task;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_put_mm;
+
+ hold_task_mempolicy(priv);
+ vma = vma_next(&vmi);
+
+ if (unlikely(!vma))
+ goto empty_set;
+
+ vma_start = vma->vm_start;
+ do {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ vma_iter_invalidate(&vmi);
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * vma_next(vmi) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * vma_next(vmi) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * vma_next(vmi) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * vma_next(vmi) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = vma_next(&vmi);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 and 2 above */
+ if (vma->vm_start >= last_vma_end) {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
+ continue;
+ }
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end) {
+ smap_gather_stats(vma, &mss, last_vma_end);
+ last_vma_end = vma->vm_end;
+ }
+ }
+ } for_each_vma(vmi, vma);
+
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+
+ __show_smap(m, &mss, true);
+
+ release_task_mempolicy(priv);
+ mmap_read_unlock(mm);
+
+out_put_mm:
+ mmput(mm);
+out_put_task:
+ put_task_struct(priv->task);
+ priv->task = NULL;
+
+ return ret;
}
+#undef SEQ_PUT_DEC
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_smap
-};
-
-static const struct seq_operations proc_tid_smaps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_smap
+ .show = show_smap
};
static int pid_smaps_open(struct inode *inode, struct file *file)
@@ -836,9 +1516,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
-static int tid_smaps_open(struct inode *inode, struct file *file)
+static int smaps_rollup_open(struct inode *inode, struct file *file)
{
- return do_maps_open(inode, file, &proc_tid_smaps_op);
+ int ret;
+ struct proc_maps_private *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
+ if (!priv)
+ return -ENOMEM;
+
+ ret = single_open(file, show_smaps_rollup, priv);
+ if (ret)
+ goto out_free;
+
+ priv->inode = inode;
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
+
+ single_release(inode, file);
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ kfree(priv);
+ return ret;
+}
+
+static int smaps_rollup_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
+
+ kfree(priv);
+ return single_release(inode, file);
}
const struct file_operations proc_pid_smaps_operations = {
@@ -848,11 +1564,11 @@ const struct file_operations proc_pid_smaps_operations = {
.release = proc_map_release,
};
-const struct file_operations proc_tid_smaps_operations = {
- .open = tid_smaps_open,
+const struct file_operations proc_pid_smaps_rollup_operations = {
+ .open = smaps_rollup_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = proc_map_release,
+ .release = smaps_rollup_release,
};
enum clear_refs_types {
@@ -868,52 +1584,78 @@ struct clear_refs_private {
enum clear_refs_types type;
};
-#ifdef CONFIG_MEM_SOFT_DIRTY
+static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ struct folio *folio;
+
+ if (!pte_write(pte))
+ return false;
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+ if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)))
+ return false;
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio)
+ return false;
+ return folio_maybe_dma_pinned(folio);
+}
+
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
+ if (!pgtable_supports_soft_dirty())
+ return;
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
- * Documentation/vm/soft-dirty.txt for full description
+ * Documentation/admin-guide/mm/soft-dirty.rst for full description
* of how soft-dirty works.
*/
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
+
+ if (pte_none(ptent))
+ return;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ if (pte_is_pinned(vma, addr, ptent))
+ return;
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
- } else if (is_swap_pte(ptent)) {
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
}
-#else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-#endif
-#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = *pmdp;
+ pmd_t old, pmd = *pmdp;
- /* See comment in change_huge_pmd() */
- pmdp_invalidate(vma, addr, pmdp);
- if (pmd_dirty(*pmdp))
- pmd = pmd_mkdirty(pmd);
- if (pmd_young(*pmdp))
- pmd = pmd_mkyoung(pmd);
-
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_soft_dirty(pmd);
+ if (!pgtable_supports_soft_dirty())
+ return;
- set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ if (pmd_present(pmd)) {
+ /* See comment in change_huge_pmd() */
+ old = pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(old))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(old))
+ pmd = pmd_mkyoung(pmd);
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_soft_dirty(pmd);
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (pmd_is_migration_entry(pmd)) {
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
@@ -929,7 +1671,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -938,23 +1680,27 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
goto out;
}
- page = pmd_page(*pmd);
+ if (!pmd_present(*pmd))
+ goto out;
+
+ folio = pmd_folio(*pmd);
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
out:
spin_unlock(ptl);
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
-
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty(vma, addr, pte);
@@ -964,14 +1710,14 @@ out:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -1000,18 +1746,23 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
return 0;
}
+static const struct mm_walk_ops clear_refs_walk_ops = {
+ .pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1028,55 +1779,46 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ VMA_ITERATOR(vmi, mm, 0);
+ struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
};
- struct mm_walk clear_refs_walk = {
- .pmd_entry = clear_refs_pte_range,
- .test_walk = clear_refs_test_walk,
- .mm = mm,
- .private = &cp,
- };
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
-
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm);
- up_write(&mm->mmap_sem);
- goto out_mm;
+ goto out_unlock;
}
- down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
- up_read(&mm->mmap_sem);
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- vma->vm_flags &= ~VM_SOFTDIRTY;
- vma_set_page_prot(vma);
- }
- downgrade_write(&mm->mmap_sem);
- break;
+ vm_flags_clear(vma, VM_SOFTDIRTY);
+ vma_set_page_prot(vma);
}
- mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+ inc_tlb_flush_pending(mm);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, mm, 0, -1UL);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ mmu_notifier_invalidate_range_end(&range);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
}
- walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
- if (type == CLEAR_REFS_SOFT_DIRTY)
- mmu_notifier_invalidate_range_end(mm, 0, -1);
- flush_tlb_mm(mm);
- up_read(&mm->mmap_sem);
+out_unlock:
+ mmap_write_unlock(mm);
out_mm:
mmput(mm);
}
@@ -1108,6 +1850,8 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_GUARD_REGION BIT_ULL(58)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1119,8 +1863,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1128,8 +1871,15 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
return 0;
}
+static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
+{
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ return folio_precise_page_mapcount(folio, page) == 1;
+ return !folio_maybe_mapped_shared(folio);
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
unsigned long addr = start;
@@ -1147,7 +1897,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1159,7 +1909,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1173,6 +1923,10 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ struct folio *folio;
+
+ if (pte_none(pte))
+ goto out;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1181,28 +1935,134 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
- } else if (is_swap_pte(pte)) {
- swp_entry_t entry;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ } else {
+ softleaf_t entry;
+
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
- entry = pte_to_swp_entry(pte);
- frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ entry = softleaf_from_pte(pte);
+ if (pm->show_pfn) {
+ pgoff_t offset;
+
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
+ if (softleaf_is_uffd_wp_marker(entry))
+ flags |= PM_UFFD_WP;
+ if (softleaf_is_guard_marker(entry))
+ flags |= PM_GUARD_REGION;
+ }
+
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ if ((flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ flags |= PM_MMAP_EXCLUSIVE;
}
- if (page && !PageAnon(page))
- flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+out:
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
return make_pme(frame, flags);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct vm_area_struct *vma,
+ struct pagemapread *pm)
+{
+ unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
+ struct page *page = NULL;
+ struct folio *folio = NULL;
+ int err = 0;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ if (pmd_none(pmd))
+ goto populate_pagemap;
+
+ if (pmd_present(pmd)) {
+ page = pmd_page(pmd);
+
+ flags |= PM_PRESENT;
+ if (pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) + idx;
+ } else if (thp_migration_supported()) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+ unsigned long offset;
+
+ if (pm->show_pfn) {
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry) + idx;
+ else
+ offset = swp_offset(entry) + idx;
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
+ page = softleaf_to_page(entry);
+ }
+
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+populate_pagemap:
+ for (; addr != end; addr += PAGE_SIZE, idx++) {
+ u64 cur_flags = flags;
+ pagemap_entry_t pme;
+
+ if (folio && (flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ cur_flags |= PM_MMAP_EXCLUSIVE;
+
+ pme = make_pme(frame, cur_flags);
+ err = add_to_pagemap(&pme, pm);
+ if (err)
+ break;
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
+ }
+ return err;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -1215,57 +2075,26 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
- u64 flags = 0, frame = 0;
- pmd_t pmd = *pmdp;
-
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
-
- /*
- * Currently pmd for thp is always present because thp
- * can not be swapped-out, migrated, or HWPOISONed
- * (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd)) {
- struct page *page = pmd_page(pmd);
-
- if (page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
-
- flags |= PM_PRESENT;
- if (pm->show_pfn)
- frame = pmd_pfn(pmd) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- }
-
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
-
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- break;
- if (pm->show_pfn && (flags & PM_PRESENT))
- frame++;
- }
+ err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
spin_unlock(ptl);
return err;
}
-
- if (pmd_trans_unstable(pmdp))
- return 0;
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return err;
+ }
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
- err = add_to_pagemap(addr, &pme, pm);
+ pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1285,44 +2114,62 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
- pte = huge_ptep_get(ptep);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
- struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(pte_page(pte));
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (page_mapcount(page) == 1)
+ if (!folio_maybe_mapped_shared(folio) &&
+ !hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
}
+#else
+#define pagemap_hugetlb_range NULL
#endif /* HUGETLB_PAGE */
+static const struct mm_walk_ops pagemap_ops = {
+ .pmd_entry = pagemap_pmd_range,
+ .pte_hole = pagemap_pte_hole,
+ .hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -1332,9 +2179,11 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bit 58 pte is a guard region
+ * Bits 59-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1354,7 +2203,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
{
struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
@@ -1377,34 +2225,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
- pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
+ pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
ret = -ENOMEM;
if (!pm.buffer)
goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pmd_range;
- pagemap_walk.pte_hole = pagemap_pte_hole;
-#ifdef CONFIG_HUGETLB_PAGE
- pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
-#endif
- pagemap_walk.mm = mm;
- pagemap_walk.private = &pm;
-
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
- start_vaddr = svpfn << PAGE_SHIFT;
end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > mm->task_size >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ unsigned long end;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+
+ end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
+ if (end >= start_vaddr && end < mm->task_size)
+ end_vaddr = end;
+ }
+
+ /* Ensure the address is inside the task */
+ if (start_vaddr > mm->task_size)
start_vaddr = end_vaddr;
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
ret = 0;
while (count && (start_vaddr < end_vaddr)) {
int len;
@@ -1415,9 +2264,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* overflow ? */
if (end < start_vaddr || end > end_vaddr)
end = end_vaddr;
- down_read(&mm->mmap_sem);
- ret = walk_page_range(start_vaddr, end, &pagemap_walk);
- up_read(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+ mmap_read_unlock(mm);
start_vaddr = end;
len = min(count, PM_ENTRY_BYTES * pm.pos);
@@ -1446,8 +2297,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
struct mm_struct *mm;
mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
}
@@ -1461,11 +2312,794 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+ PAGE_IS_GUARD)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories;
+
+ if (pte_none(pte))
+ return 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories = PAGE_IS_PRESENT;
+
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ softleaf_t entry;
+
+ categories = PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ entry = softleaf_from_pte(pte);
+ if (softleaf_is_guard_marker(entry))
+ categories |= PAGE_IS_GUARD;
+ else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+ softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte, pte_t ptent)
+{
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(old_pte);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (pte_none(ptent)) {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ } else {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_none(pmd))
+ return categories;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_huge_zero_pmd(pmd))
+ categories |= PAGE_IS_PFNZERO;
+ if (pmd_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (pmd_is_migration_entry(pmd)) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pte_none(pte))
+ return categories;
+
+ /*
+ * According to pagemap_hugetlb_range(), file-backed HugeTLB
+ * page cannot be swapped. So PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ const unsigned long psize = huge_page_size(hstate_vma(vma));
+ softleaf_t entry;
+
+ if (huge_pte_none(ptent)) {
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+ return;
+ }
+
+ entry = softleaf_from_pte(ptent);
+ if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
+ return;
+
+ if (softleaf_is_migration(entry))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (!p->vec_buf)
+ return;
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+ bool wp_allowed = userfaultfd_wp_async(vma) &&
+ userfaultfd_wp_use_markers(vma);
+
+ if (!wp_allowed) {
+ /* User requested explicit failure over wp-async capability */
+ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+ /*
+ * User requires wr-protect, and allows silently skipping
+ * unsupported vmas.
+ */
+ if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ return 1;
+ /*
+ * Then the request doesn't involve wr-protects at all,
+ * fall through to the rest checks, and allow vma walk.
+ */
+ }
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (wp_allowed)
+ vma_category |= PAGE_IS_WPALLOWED;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma_category |= PAGE_IS_SOFT_DIRTY;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+ * When there is no output buffer provided at all, the sentinel values
+ * won't match here. There is no other way for `cur_buf->end` to be
+ * non-zero other than it being non-empty.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ arch_enter_lazy_mmu_mode();
+
+ if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptent);
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ size_mul(arg->vec_len, sizeof(struct page_region))))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
@@ -1490,28 +3124,34 @@ struct numa_maps_private {
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
- int count = page_mapcount(page);
+ struct folio *folio = page_folio(page);
+ int count;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ count = folio_precise_page_mapcount(folio, page);
+ else
+ count = folio_average_page_mapcount(folio);
md->pages += nr_pages;
- if (pte_dirty || PageDirty(page))
+ if (pte_dirty || folio_test_dirty(folio))
md->dirty += nr_pages;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
md->active += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
md->writeback += nr_pages;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)] += nr_pages;
+ md->node[folio_nid(folio)] += nr_pages;
}
static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
@@ -1524,7 +3164,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
@@ -1584,16 +3224,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
spin_unlock(ptl);
return 0;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
#endif
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
do {
- struct page *page = can_gather_numa_stats(*pte, vma, addr);
+ pte_t ptent = ptep_get(pte);
+ struct page *page = can_gather_numa_stats(ptent, vma, addr);
if (!page)
continue;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(ptent), 1);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
@@ -1604,19 +3246,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
@@ -1628,10 +3273,16 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
}
#endif
+static const struct mm_walk_ops show_numa_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
/*
* Display pages allocated per node and memory policy via /proc.
*/
-static int show_numa_map(struct seq_file *m, void *v, int is_pid)
+static int show_numa_map(struct seq_file *m, void *v)
{
struct numa_maps_private *numa_priv = m->private;
struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1639,14 +3290,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mm_walk walk = {
- .hugetlb_entry = gather_hugetlb_stats,
- .pmd_entry = gather_pte_stats,
- .private = md,
- .mm = mm,
- };
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1655,7 +3301,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
@@ -1667,18 +3313,18 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_path(m, file_user_path(file), "\n\t= ");
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(proc_priv, vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_sem is held by m_start */
- walk_page_vma(vma, &walk);
+ /* mmap_lock is held by m_start */
+ walk_page_vma(vma, &show_numa_ops, md);
if (!md->pages)
goto out;
@@ -1711,49 +3357,20 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
- m_cache_vma(m, vma);
return 0;
}
-static int show_pid_numa_map(struct seq_file *m, void *v)
-{
- return show_numa_map(m, v, 1);
-}
-
-static int show_tid_numa_map(struct seq_file *m, void *v)
-{
- return show_numa_map(m, v, 0);
-}
-
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_numa_map,
+ .show = show_numa_map,
};
-static const struct seq_operations proc_tid_numa_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_numa_map,
-};
-
-static int numa_maps_open(struct inode *inode, struct file *file,
- const struct seq_operations *ops)
-{
- return proc_maps_open(inode, file, ops,
- sizeof(struct numa_maps_private));
-}
-
static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
- return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
-}
-
-static int tid_numa_maps_open(struct inode *inode, struct file *file)
-{
- return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+ return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
+ sizeof(struct numa_maps_private));
}
const struct file_operations proc_pid_numa_maps_operations = {
@@ -1763,10 +3380,4 @@ const struct file_operations proc_pid_numa_maps_operations = {
.release = proc_map_release,
};
-const struct file_operations proc_tid_numa_maps_operations = {
- .open = tid_numa_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_map_release,
-};
#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 23266694db11..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/file.h>
@@ -19,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- down_read(&mm->mmap_sem);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@@ -39,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
}
if (atomic_read(&mm->mm_count) > 1 ||
- vma->vm_flags & VM_MAYSHARE) {
+ is_nommu_shared_mapping(vma->vm_flags)) {
sbytes += size;
} else {
bytes += size;
@@ -52,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
sbytes += kobjsize(mm);
else
bytes += kobjsize(mm);
-
+
if (current->fs && current->fs->users > 1)
sbytes += kobjsize(current->fs);
else
@@ -63,34 +62,32 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(current->files);
- if (current->sighand && atomic_read(&current->sighand->count) > 1)
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
sbytes += kobjsize(current->sighand);
else
bytes += kobjsize(current->sighand);
bytes += kobjsize(current); /* includes kernel stack */
+ mmap_read_unlock(mm);
+
seq_printf(m,
"Mem:\t%8lu bytes\n"
"Slack:\t%8lu bytes\n"
"Shared:\t%8lu bytes\n",
bytes, slack, sbytes);
-
- up_read(&mm->mmap_sem);
}
unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;
- down_read(&mm->mmap_sem);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return vsize;
}
@@ -98,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);
- down_read(&mm->mmap_sem);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -118,35 +114,19 @@ unsigned long task_statm(struct mm_struct *mm,
>> PAGE_SHIFT;
*data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
>> PAGE_SHIFT;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
size >>= PAGE_SHIFT;
size += *text + *data;
*resident = size;
return size;
}
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
- int is_pid)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
- struct proc_maps_private *priv = m->private;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
@@ -177,10 +157,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
- } else if (mm && is_stack(priv, vma)) {
+ seq_path(m, file_user_path(file), "");
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
@@ -190,85 +170,83 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_p, int is_pid)
+static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
-
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
- is_pid);
+ return nommu_vma_show(m, _p);
}
-static int show_pid_map(struct seq_file *m, void *_p)
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
{
- return show_map(m, _p, 1);
-}
+ struct vm_area_struct *vma = vma_next(&priv->iter);
-static int show_tid_map(struct seq_file *m, void *_p)
-{
- return show_map(m, _p, 0);
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
+ return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ mm = priv->lock_ctx.mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
+
+ if (mmap_read_lock_killable(mm)) {
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return ERR_PTR(-EINTR);
+ }
- down_read(&mm->mmap_sem);
- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ vma_iter_init(&priv->iter, mm, last_addr);
- up_read(&mm->mmap_sem);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->lock_ctx.mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- up_read(&priv->mm->mmap_sem);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct rb_node *p = _p;
-
- (*pos)++;
- return p ? rb_next(p) : NULL;
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_pid_map
-};
-
-static const struct seq_operations proc_tid_maps_ops = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_tid_map
+ .show = show_map
};
static int maps_open(struct inode *inode, struct file *file,
@@ -281,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
seq_release_private(inode, file);
return err;
@@ -298,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
@@ -309,11 +287,6 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return maps_open(inode, file, &proc_pid_maps_ops);
}
-static int tid_maps_open(struct inode *inode, struct file *file)
-{
- return maps_open(inode, file, &proc_tid_maps_ops);
-}
-
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
@@ -321,10 +294,3 @@ const struct file_operations proc_pid_maps_operations = {
.release = map_release,
};
-const struct file_operations proc_tid_maps_operations = {
- .open = tid_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = map_release,
-};
-
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 20614b62a9b7..d6113dbe58e0 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
@@ -10,18 +12,17 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = inode->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
- dentry ? GFP_KERNEL : GFP_ATOMIC);
+ name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
if (unlikely(!name))
return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
- sprintf(name, "%d/task/%d", tgid, pid);
+ sprintf(name, "%u/task/%u", tgid, pid);
set_delayed_call(done, kfree_link, name);
return name;
}
@@ -30,40 +31,35 @@ static const struct inode_operations proc_thread_self_inode_operations = {
.get_link = proc_thread_self_get_link,
};
-static unsigned thread_self_inum;
+unsigned thread_self_inum __ro_after_init;
int proc_setup_thread_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = s->s_fs_info;
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
- d_add(thread_self, inode);
- } else {
- dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
+ d_make_persistent(thread_self, inode);
+ ret = 0;
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
+ dput(thread_self);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
- pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+
+ if (ret)
+ pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7981c4ffe787..b5343d209381 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,25 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
+#include "internal.h"
static int uptime_proc_show(struct seq_file *m, void *v)
{
- struct timespec uptime;
- struct timespec idle;
- u64 nsec;
+ struct timespec64 uptime;
+ struct timespec64 idle;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
- get_monotonic_boottime(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
+
+ ktime_get_boottime_ts64(&uptime);
+ timens_add_boottime(&uptime);
+
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
@@ -29,21 +38,12 @@ static int uptime_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
- .open = uptime_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_uptime_init(void)
{
- proc_create("uptime", 0, NULL, &uptime_proc_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_uptime_init);
diff --git a/fs/proc/util.c b/fs/proc/util.c
new file mode 100644
index 000000000000..98f8adc17345
--- /dev/null
+++ b/fs/proc/util.c
@@ -0,0 +1,24 @@
+#include <linux/dcache.h>
+#include "internal.h"
+
+unsigned name_to_int(const struct qstr *qstr)
+{
+ const char *name = qstr->name;
+ int len = qstr->len;
+ unsigned n = 0;
+
+ if (len > 1 && *name == '0')
+ goto out;
+ do {
+ unsigned c = *name++ - '0';
+ if (c > 9)
+ goto out;
+ if (n >= (~0U-9)/10)
+ goto out;
+ n *= 10;
+ n += c;
+ } while (--len > 0);
+ return n;
+out:
+ return ~0U;
+}
diff --git a/fs/proc/version.c b/fs/proc/version.c
index d2154eb6d78f..02e3c3cd4a9a 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include "internal.h"
static int version_proc_show(struct seq_file *m, void *v)
{
@@ -14,21 +16,12 @@ static int version_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int version_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, version_proc_show, NULL);
-}
-
-static const struct file_operations version_proc_fops = {
- .open = version_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_version_init(void)
{
- proc_create("version", 0, NULL, &version_proc_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 885d445afa0d..f188bd900eb2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/proc/vmcore.c Interface for accessing the crash
* dump from the system's previous life.
@@ -7,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) "vmcore: " fmt
+
#include <linux/mm.h>
#include <linux/kcore.h>
#include <linux/user.h>
@@ -16,13 +19,16 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -38,59 +44,125 @@ static size_t elfcorebuf_sz_orig;
static char *elfnotes_buf;
static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore;
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+struct vmcoredd_node {
+ struct list_head list; /* List of dumps */
+ void *buf; /* Buffer containing device's dump */
+ unsigned int size; /* Size of the buffer */
+};
+
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+
+static bool vmcoredd_disabled;
+core_param(novmcoredd, vmcoredd_disabled, bool, 0);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
+static DEFINE_MUTEX(vmcore_mutex);
+
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
+/* Whether the vmcore is currently open. */
+static unsigned int vmcore_open;
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+static void vmcore_process_device_ram(struct vmcore_cb *cb);
+
+void register_vmcore_cb(struct vmcore_cb *cb)
{
- if (oldmem_pfn_is_ram)
- return -EBUSY;
- oldmem_pfn_is_ram = fn;
- return 0;
+ INIT_LIST_HEAD(&cb->next);
+ mutex_lock(&vmcore_mutex);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ if (!vmcore_open && cb->get_device_ram)
+ vmcore_process_device_ram(cb);
+ mutex_unlock(&vmcore_mutex);
}
-EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
-void unregister_oldmem_pfn_is_ram(void)
+void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- oldmem_pfn_is_ram = NULL;
- wmb();
+ mutex_lock(&vmcore_mutex);
+ list_del_rcu(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ mutex_unlock(&vmcore_mutex);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
-EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
{
- int (*fn)(unsigned long pfn);
- /* pfn is ram unless fn() checks pagetype */
- int ret = 1;
+ struct vmcore_cb *cb;
+ bool ret = true;
- /*
- * Ask hypervisor if the pfn is really ram.
- * A ballooned page contains no data and reading from such a page
- * will cause high load in the hypervisor.
- */
- fn = oldmem_pfn_is_ram;
- if (fn)
- ret = fn(pfn);
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
return ret;
}
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ mutex_lock(&vmcore_mutex);
+ vmcore_opened = true;
+ if (vmcore_open + 1 == 0) {
+ mutex_unlock(&vmcore_mutex);
+ return -EBUSY;
+ }
+ vmcore_open++;
+ mutex_unlock(&vmcore_mutex);
+
+ return 0;
+}
+
+static int release_vmcore(struct inode *inode, struct file *file)
+{
+ mutex_lock(&vmcore_mutex);
+ vmcore_open--;
+ mutex_unlock(&vmcore_mutex);
+
+ return 0;
+}
+
/* Reads a page from the oldmem device from given offset. */
-static ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf)
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
- size_t nr_bytes;
+ ssize_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -98,6 +170,7 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -105,21 +178,29 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
- memset(buf, 0, nr_bytes);
- else {
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
- if (tmp < 0)
- return tmp;
+ if (!pfn_is_ram(pfn)) {
+ tmp = iov_iter_zero(nr_bytes, iter);
+ } else {
+ if (encrypted)
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
+ nr_bytes,
+ offset);
+ else
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
}
+ if (tmp < nr_bytes) {
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return -EFAULT;
+ }
+
*ppos += nr_bytes;
count -= nr_bytes;
- buf += nr_bytes;
read += nr_bytes;
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return read;
}
@@ -143,7 +224,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
}
/*
@@ -151,7 +237,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -161,118 +253,229 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot);
}
/*
- * Copy to either kernel or user space
+ * Architectures which support memory encryption override this.
*/
-static int copy_to(void *target, void *src, size_t size, int userbuf)
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
{
- if (userbuf) {
- if (copy_to_user((char __user *) target, src, size))
- return -EFAULT;
- } else {
- memcpy(target, src, size);
+ return copy_oldmem_page(iter, pfn, csize, offset);
+}
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ size_t tsz;
+ char *buf;
+
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (copy_to_iter(buf, tsz, iter) < tsz)
+ return -EFAULT;
+
+ size -= tsz;
+ start += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ return 0;
+ }
+ offset += dump->size;
}
+
return 0;
}
+#ifdef CONFIG_MMU
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+ u64 start, size_t size)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ size_t tsz;
+ char *buf;
+
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (remap_vmalloc_range_partial(vma, dst, buf, 0,
+ tsz))
+ return -EFAULT;
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ return 0;
+ }
+ offset += dump->size;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_MMU */
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
-static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
- int userbuf)
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
+ struct vmcore_range *m = NULL;
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
- struct vmcore *m = NULL;
- if (buflen == 0 || *fpos >= vmcore_size)
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
- /* trim buflen to not go beyond EOF */
- if (buflen > vmcore_size - *fpos)
- buflen = vmcore_size - *fpos;
+ iov_iter_truncate(iter, vmcore_size - *fpos);
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
- if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
- /* Read Elf note segment */
+ /* Read ELF note segment */
if (*fpos < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
- tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
- kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
- if (copy_to(buffer, kaddr, tsz, userbuf))
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensure that zero-filled data can be
+ * avoided.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)*fpos, iov_iter_count(iter));
+ start = *fpos - elfcorebuf_sz;
+ if (vmcoredd_copy_dumps(iter, start, tsz))
+ return -EFAULT;
+
+ *fpos += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!iov_iter_count(iter))
+ return acc;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
+
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
+
+ cond_resched();
}
list_for_each_entry(m, &vmcore_list, list) {
if (*fpos < m->offset + m->size) {
tsz = (size_t)min_t(unsigned long long,
m->offset + m->size - *fpos,
- buflen);
+ iov_iter_count(iter));
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
+
+ cond_resched();
}
return acc;
}
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
{
- return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+ return __read_vmcore(iter, &iocb->ki_pos);
}
+/**
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *vmcore_alloc_buf(size_t size)
+{
+#ifdef CONFIG_MMU
+ return vmalloc_user(size);
+#else
+ return vzalloc(size);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+
/*
* The vmcore fault handler uses the page cache and fills data using the
- * standard __vmcore_read() function.
+ * standard __read_vmcore() function.
*
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
*/
-static int mmap_vmcore_fault(struct vm_fault *vmf)
+static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
{
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
struct page *page;
loff_t offset;
- char *buf;
int rc;
page = find_or_create_page(mapping, index, GFP_KERNEL);
@@ -280,12 +483,15 @@ static int mmap_vmcore_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
offset = (loff_t) index << PAGE_SHIFT;
- buf = __va((page_to_pfn(page) << PAGE_SHIFT));
- rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
unlock_page(page);
put_page(page);
- return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ return vmf_error(rc);
}
SetPageUptodate(page);
}
@@ -301,35 +507,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
-/**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- * vmalloc memory
- *
- * @notes_sz: size of buffer
- *
- * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
- * the buffer to user-space by means of remap_vmalloc_range().
- *
- * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
- * disabled and there's no need to allow users to mmap the buffer.
- */
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
-{
-#ifdef CONFIG_MMU
- return vmalloc_user(notes_sz);
-#else
- return vzalloc(notes_sz);
-#endif
-}
-
-/*
- * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
- * essential for mmap_vmcore() in order to map physically
- * non-contiguous objects (ELF header, ELF note segment and memory
- * regions in the 1st kernel pointed to by PT_LOAD entries) into
- * virtually contiguous user-space in ELF layout.
- */
-#ifdef CONFIG_MMU
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.
@@ -396,21 +573,26 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ int ret, idx;
+
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- if (oldmem_pfn_is_ram)
- return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ if (!list_empty(&vmcore_cb_list))
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
- return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return ret;
}
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
u64 start, end, len, tsz;
- struct vmcore *m;
+ struct vmcore_range *m;
start = (u64)vma->vm_pgoff << PAGE_SHIFT;
end = start + size;
@@ -421,8 +603,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (vma->vm_flags & (VM_WRITE | VM_EXEC))
return -EPERM;
- vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC);
vma->vm_ops = &vmcore_mmap_ops;
len = 0;
@@ -446,11 +627,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensure that zero-filled data can be
+ * avoided. This also ensures that the device dumps and
+ * other elf notes can be properly mmaped at page aligned
+ * address.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+ u64 start_off;
+
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)start, size);
+ start_off = start - elfcorebuf_sz;
+ if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+ start_off, tsz))
+ goto fail;
+
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ /* leave now if filled buffer already */
+ if (!size)
+ return 0;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
- kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
- kaddr, tsz))
+ kaddr, 0, tsz))
goto fail;
+
size -= tsz;
start += tsz;
len += tsz;
@@ -491,22 +707,19 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
}
#endif
-static const struct file_operations proc_vmcore_operations = {
- .read = read_vmcore,
- .llseek = default_llseek,
- .mmap = mmap_vmcore,
+static const struct proc_ops vmcore_proc_ops = {
+ .proc_open = open_vmcore,
+ .proc_release = release_vmcore,
+ .proc_read_iter = read_vmcore,
+ .proc_lseek = default_llseek,
+ .proc_mmap = mmap_vmcore,
};
-static struct vmcore* __init get_new_element(void)
-{
- return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
-}
-
-static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
- struct list_head *vc_list)
+static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
+ struct vmcore_range *m;
u64 size;
- struct vmcore *m;
size = elfsz + elfnotesegsz;
list_for_each_entry(m, vc_list, list) {
@@ -665,7 +878,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -681,7 +894,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
- phdr.p_align = 0;
+ phdr.p_align = 4;
/* Add merged PT_NOTE program header*/
tmp = elfptr + sizeof(Elf64_Ehdr);
@@ -698,6 +911,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -851,7 +1069,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -867,7 +1085,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
- phdr.p_align = 0;
+ phdr.p_align = 4;
/* Add merged PT_NOTE program header*/
tmp = elfptr + sizeof(Elf32_Ehdr);
@@ -884,6 +1102,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -898,12 +1121,11 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
Elf64_Ehdr *ehdr_ptr;
Elf64_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -917,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -941,12 +1158,11 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
Elf32_Ehdr *ehdr_ptr;
Elf32_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -960,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -976,13 +1187,13 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
- struct list_head *vc_list)
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
+ struct vmcore_range *m;
loff_t vmcore_off;
- struct vmcore *m;
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
@@ -1007,7 +1218,7 @@ static int __init parse_crash_elf64_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1063,7 +1274,7 @@ static int __init parse_crash_elf32_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1145,6 +1356,359 @@ static int __init parse_crash_elf_headers(void)
return 0;
}
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer where the note is written
+ * @data: Dump info
+ * @size: Size of the dump
+ *
+ * Fills beginning of the dump's buffer with vmcore device dump header.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+ u32 size)
+{
+ struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+ vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+ vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
+ vdd_hdr->n_type = NT_VMCOREDD;
+
+ strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME);
+ strscpy_pad(vdd_hdr->dump_name, data->dump_name);
+}
+
+/**
+ * vmcoredd_update_program_headers - Update all ELF program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of ELF header (Elf64 or Elf32) and update the elf note size.
+ * Also update the offsets of all the program headers after the elf note header.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+ size_t vmcoreddsz)
+{
+ unsigned char *e_ident = (unsigned char *)elfptr;
+ u64 start, end, size;
+ loff_t vmcore_off;
+ u32 i;
+
+ vmcore_off = elfcorebuf_sz + elfnotesz;
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ } else {
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ }
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * ELF header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the ELF program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+ vmcoredd_orig_sz += dump_size;
+ elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+ vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+ vmcoredd_orig_sz);
+
+ /* Update vmcore list offsets */
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write ELF note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+ struct vmcoredd_node *dump;
+ void *buf = NULL;
+ size_t data_size;
+ int ret;
+
+ if (vmcoredd_disabled) {
+ pr_err_once("Device dump is disabled\n");
+ return -EINVAL;
+ }
+
+ if (!data || !strlen(data->dump_name) ||
+ !data->vmcoredd_callback || !data->size)
+ return -EINVAL;
+
+ dump = vzalloc(sizeof(*dump));
+ if (!dump)
+ return -ENOMEM;
+
+ /* Keep size of the buffer page aligned so that it can be mmaped */
+ data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+ PAGE_SIZE);
+
+ /* Allocate buffer for driver's to write their dumps */
+ buf = vmcore_alloc_buf(data_size);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ vmcoredd_write_header(buf, data, data_size -
+ sizeof(struct vmcoredd_header));
+
+ /* Invoke the driver's dump collection routing */
+ ret = data->vmcoredd_callback(data, buf +
+ sizeof(struct vmcoredd_header));
+ if (ret)
+ goto out_err;
+
+ dump->buf = buf;
+ dump->size = data_size;
+
+ /* Add the dump to driver sysfs list and update the elfcore hdr */
+ scoped_guard(mutex, &vmcore_mutex) {
+ if (vmcore_opened)
+ pr_warn_once("Unexpected adding of device dump\n");
+ if (vmcore_open) {
+ ret = -EBUSY;
+ goto out_err;
+ }
+
+ list_add_tail(&dump->list, &vmcoredd_list);
+ vmcoredd_update_size(data_size);
+ }
+ return 0;
+
+out_err:
+ vfree(buf);
+ vfree(dump);
+
+ return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
+static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size)
+{
+ char *elfcorebuf_new;
+
+ if (WARN_ON_ONCE(new_size < elfcorebuf_sz))
+ return -EINVAL;
+ if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) {
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+ }
+
+ elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(new_size));
+ if (!elfcorebuf_new)
+ return -ENOMEM;
+ memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz);
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = elfcorebuf_new;
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+}
+
+static void vmcore_reset_offsets_elf64(void)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz;
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ Elf64_Phdr *phdr;
+ int i;
+
+ for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) {
+ u64 start, end;
+
+ /*
+ * After merge_note_headers_elf64() we should only have a single
+ * PT_NOTE entry that starts immediately after elfcorebuf_sz.
+ */
+ if (phdr->p_type == PT_NOTE) {
+ phdr->p_offset = elfcorebuf_sz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE);
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off = vmcore_off + end - start;
+ }
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+}
+
+static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ struct vmcore_range *cur;
+ Elf64_Phdr *phdr;
+ size_t new_size;
+ int rc;
+
+ if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) {
+ pr_err("too many device ram ranges\n");
+ return -ENOSPC;
+ }
+
+ /* elfcorebuf_sz must always cover full pages. */
+ new_size = sizeof(Elf64_Ehdr) +
+ (ehdr->e_phnum + count) * sizeof(Elf64_Phdr);
+ new_size = roundup(new_size, PAGE_SIZE);
+
+ /*
+ * Make sure we have sufficient space to include the new PT_LOAD
+ * entries.
+ */
+ rc = vmcore_realloc_elfcore_buffer_elf64(new_size);
+ if (rc) {
+ pr_err("resizing elfcore failed\n");
+ return rc;
+ }
+
+ /* Modify our used elfcore buffer size to cover the new entries. */
+ elfcorebuf_sz = new_size;
+
+ /* Fill the added PT_LOAD entries. */
+ phdr = phdr_start + ehdr->e_phnum;
+ list_for_each_entry(cur, list, list) {
+ WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE));
+ elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size);
+
+ /* p_offset will be adjusted later. */
+ phdr++;
+ ehdr->e_phnum++;
+ }
+ list_splice_tail(list, &vmcore_list);
+
+ /* We changed elfcorebuf_sz and added new entries; reset all offsets. */
+ vmcore_reset_offsets_elf64();
+
+ /* Finally, recalculate the total vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+ return 0;
+}
+
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+ unsigned char *e_ident = (unsigned char *)elfcorebuf;
+ struct vmcore_range *first, *m;
+ LIST_HEAD(list);
+ int count;
+
+ /* We only support Elf64 dumps for now. */
+ if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) {
+ pr_err("device ram ranges only support Elf64\n");
+ return;
+ }
+
+ if (cb->get_device_ram(cb, &list)) {
+ pr_err("obtaining device ram ranges failed\n");
+ return;
+ }
+ count = list_count_nodes(&list);
+ if (!count)
+ return;
+
+ /*
+ * For some reason these ranges are already know? Might happen
+ * with unusual register->unregister->register sequences; we'll simply
+ * sanity check using the first range.
+ */
+ first = list_first_entry(&list, struct vmcore_range, list);
+ list_for_each_entry(m, &vmcore_list, list) {
+ unsigned long long m_end = m->paddr + m->size;
+ unsigned long long first_end = first->paddr + first->size;
+
+ if (first->paddr < m_end && m->paddr < first_end)
+ goto out_free;
+ }
+
+ /* If adding the mem nodes succeeds, they must not be freed. */
+ if (!vmcore_add_device_ram_elf64(&list, count))
+ return;
+out_free:
+ vmcore_free_ranges(&list);
+}
+#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
+
+/* Free all dumps in vmcore device dump list */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ mutex_lock(&vmcore_mutex);
+ while (!list_empty(&vmcoredd_list)) {
+ struct vmcoredd_node *dump;
+
+ dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+ list);
+ list_del(&dump->list);
+ vfree(dump->buf);
+ vfree(dump);
+ }
+ mutex_unlock(&vmcore_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
/* Init function for vmcore module. */
static int __init vmcore_init(void)
{
@@ -1162,13 +1726,14 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
- pr_warn("Kdump: vmcore not initialized\n");
+ elfcorehdr_free(elfcorehdr_addr);
+ pr_warn("not initialized\n");
return rc;
}
elfcorehdr_free(elfcorehdr_addr);
elfcorehdr_addr = ELFCORE_ADDR_ERR;
- proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
+ proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops);
if (proc_vmcore)
proc_vmcore->size = vmcore_size;
return 0;
@@ -1178,20 +1743,14 @@ fs_initcall(vmcore_init);
/* Cleanup function for vmcore module. */
void vmcore_cleanup(void)
{
- struct list_head *pos, *next;
-
if (proc_vmcore) {
proc_remove(proc_vmcore);
proc_vmcore = NULL;
}
- /* clear the vmcore list. */
- list_for_each_safe(pos, next, &vmcore_list) {
- struct vmcore *m;
-
- m = list_entry(pos, struct vmcore, list);
- list_del(&m->list);
- kfree(m);
- }
+ vmcore_free_ranges(&vmcore_list);
free_elfcorebuf();
+
+ /* clear vmcore device dump list */
+ vmcore_free_device_dumps();
}