Diffstat (limited to 'fs/proc')
-rw-r--r--  fs/proc/Kconfig          22
-rw-r--r--  fs/proc/Makefile          2
-rw-r--r--  fs/proc/array.c         180
-rw-r--r--  fs/proc/base.c          577
-rw-r--r--  fs/proc/bootconfig.c     23
-rw-r--r--  fs/proc/cmdline.c         7
-rw-r--r--  fs/proc/consoles.c       28
-rw-r--r--  fs/proc/cpuinfo.c         8
-rw-r--r--  fs/proc/devices.c         7
-rw-r--r--  fs/proc/fd.c            163
-rw-r--r--  fs/proc/fd.h              3
-rw-r--r--  fs/proc/generic.c       126
-rw-r--r--  fs/proc/inode.c         244
-rw-r--r--  fs/proc/internal.h      154
-rw-r--r--  fs/proc/interrupts.c      4
-rw-r--r--  fs/proc/kcore.c         303
-rw-r--r--  fs/proc/kmsg.c            3
-rw-r--r--  fs/proc/loadavg.c         8
-rw-r--r--  fs/proc/meminfo.c        52
-rw-r--r--  fs/proc/namespaces.c     11
-rw-r--r--  fs/proc/nommu.c           3
-rw-r--r--  fs/proc/page.c          292
-rw-r--r--  fs/proc/proc_net.c       68
-rw-r--r--  fs/proc/proc_sysctl.c   838
-rw-r--r--  fs/proc/proc_tty.c        2
-rw-r--r--  fs/proc/root.c          138
-rw-r--r--  fs/proc/self.c           12
-rw-r--r--  fs/proc/softirqs.c        8
-rw-r--r--  fs/proc/stat.c           41
-rw-r--r--  fs/proc/task_mmu.c     2280
-rw-r--r--  fs/proc/task_nommu.c    122
-rw-r--r--  fs/proc/thread_self.c    13
-rw-r--r--  fs/proc/uptime.c         20
-rw-r--r--  fs/proc/version.c         6
-rw-r--r--  fs/proc/vmcore.c        613
35 files changed, 4287 insertions, 2094 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..6ae966c561e7 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
- select CRASH_CORE
+ select VMCORE_INFO
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
@@ -61,6 +61,25 @@ config PROC_VMCORE_DEVICE_DUMP
as ELF notes to /proc/vmcore. You can still disable device
dump using the kernel command line option 'novmcoredd'.
+config NEED_PROC_VMCORE_DEVICE_RAM
+ bool
+
+config PROC_VMCORE_DEVICE_RAM
+ def_bool y
+ depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM
+ depends on VIRTIO_MEM
+ help
+ If the elfcore hdr is allocated and prepared by the dump kernel
+ ("2nd kernel") instead of the crashed kernel, RAM provided by memory
+ devices such as virtio-mem will not be included in the dump
+ image, because only the device driver can properly detect them.
+
+ With this config enabled, these RAM ranges will be queried from the
+ device drivers once the device gets probed, so they can be included
+ in the crash dump.
+
+ Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM.
+
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
@@ -92,6 +111,7 @@ config PROC_PAGE_MONITOR
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
default n
help
Provides a fast way to retrieve first level children pids of a task. See
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
-CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 55ecbeb3a721..42932f88141a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -56,6 +56,7 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/tty.h>
@@ -68,7 +69,6 @@
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
-#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -87,37 +87,34 @@
#include <linux/pid_namespace.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
#include <asm/processor.h>
#include "internal.h"
void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
- char *buf;
- size_t size;
char tcomm[64];
- int ret;
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else if (p->flags & PF_KTHREAD)
+ get_kthread_comm(tcomm, sizeof(tcomm), p);
else
- __get_task_comm(tcomm, sizeof(tcomm), p);
-
- size = seq_get_buf(m, &buf);
- if (escape) {
- ret = string_escape_str(tcomm, buf, size,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- if (ret >= size)
- ret = -1;
- } else {
- ret = strscpy(buf, tcomm, size);
- }
+ get_task_comm(tcomm, p);
- seq_commit(m, ret);
+ if (escape)
+ seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ else
+ seq_printf(m, "%.64s", tcomm);
}
/*
@@ -160,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
@@ -223,6 +218,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
seq_putc(m, '\n');
+
+ seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
}
void render_sigset_t(struct seq_file *m, const char *header,
@@ -283,7 +280,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = atomic_read(&__task_cred(p)->user->sigpending);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
@@ -304,13 +301,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
static void render_cap_t(struct seq_file *m, const char *header,
kernel_cap_t *a)
{
- unsigned __capi;
-
seq_puts(m, header);
- CAP_FOR_EACH_U32(__capi) {
- seq_put_hex_ll(m, NULL,
- a->cap[CAP_LAST_U32 - __capi], 8);
- }
+ seq_put_hex_ll(m, NULL, a->val, 16);
seq_putc(m, '\n');
}
@@ -341,6 +333,10 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
+ seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+ atomic_read(&p->seccomp.filter_count));
+#endif
#endif
seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
@@ -366,6 +362,34 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_puts(m, "vulnerable");
break;
}
+
+ seq_puts(m, "\nSpeculationIndirectBranch:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) {
+ case -EINVAL:
+ seq_puts(m, "unsupported");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not affected");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "conditional force disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "conditional disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "conditional enabled");
+ break;
+ case PR_SPEC_ENABLE:
+ seq_puts(m, "always enabled");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "always disabled");
+ break;
+ default:
+ seq_puts(m, "unknown");
+ break;
+ }
seq_putc(m, '\n');
}
@@ -380,14 +404,14 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
- cpumask_pr_args(task->cpus_ptr));
+ cpumask_pr_args(&task->cpus_mask));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
- cpumask_pr_args(task->cpus_ptr));
+ cpumask_pr_args(&task->cpus_mask));
}
-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
- seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}
@@ -396,10 +420,20 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
if (thp_enabled)
- thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+ thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+ struct task_struct *task)
+{
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -413,8 +447,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
- task_core_dumping(m, mm);
+ task_core_dumping(m, task);
task_thp_status(m, mm);
+ task_untag_mask(m, mm);
mmput(mm);
}
task_sig(m, task);
@@ -423,6 +458,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
+ arch_proc_pid_thread_features(m, task);
return 0;
}
@@ -439,12 +475,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
+ int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -461,7 +497,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently.
*/
- if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
if (try_get_task_stack(task)) {
eip = KSTK_EIP(task);
esp = KSTK_ESP(task);
@@ -472,12 +508,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
- cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -488,26 +520,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- } while_each_thread(task, t);
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
- gtime += sig->gtime;
+ if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+ exit_code = sig->group_exit_code;
}
sid = task_session_nr_ns(task, ns);
@@ -518,11 +535,38 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
if (permitted && (!whole || num_threads < 2))
- wchan = get_wchan(task);
- if (!whole) {
+ wchan = !task_is_running(task);
+
+ scoped_guard(rcu) {
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ }
+ }
+ }
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
@@ -531,8 +575,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
priority = task_prio(task);
nice = task_nice(task);
- /* convert nsec -> ticks */
- start_time = nsec_to_clock_t(task->start_boottime);
+ /* apply timens offset for boottime and convert nsec -> ticks */
+ start_time =
+ nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
@@ -582,10 +627,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
+ seq_put_decimal_ull(m, " ", wchan);
seq_put_decimal_ull(m, " ", 0);
seq_put_decimal_ull(m, " ", 0);
@@ -609,7 +651,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, " ", task->exit_code);
+ seq_put_decimal_ll(m, " ", exit_code);
else
seq_puts(m, " 0");
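The array.c hunk above adds a "Kthread:" line to /proc/<pid>/status that reads '1' for kernel threads and '0' otherwise. A minimal user-space sketch of a consumer, assuming only the field format shown in task_state(); the helper and its callers are hypothetical:

#include <stdio.h>
#include <string.h>

/* Return 1 if <pid> is a kernel thread, 0 if not, -1 if unknown. */
static int task_is_kthread(int pid)
{
	char path[64], line[256];
	FILE *f;
	int ret = -1;

	snprintf(path, sizeof(path), "/proc/%d/status", pid);
	f = fopen(path, "r");
	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Kthread:", 8)) {
			ret = strchr(line, '1') ? 1 : 0;
			break;
		}
	}
	fclose(f);
	return ret;
}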
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d86c0afc8a85..4eec684baca9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -58,7 +58,6 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
@@ -74,7 +73,6 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
@@ -86,6 +84,7 @@
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
@@ -96,6 +95,9 @@
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
+#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -115,6 +117,40 @@
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preserve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
struct pid_entry {
const char *name;
unsigned int len;
@@ -145,10 +181,10 @@ struct pid_entry {
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
-#define ATTR(LSM, NAME, MODE) \
+#define ATTR(LSMID, NAME, MODE) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_pid_attr_operations, \
- { .lsm = LSM })
+ { .lsmid = LSMID })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -380,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol. If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
*/
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -405,11 +441,11 @@ print0:
static int lock_trace(struct task_struct *task)
{
- int err = mutex_lock_killable(&task->signal->exec_update_mutex);
+ int err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return -EPERM;
}
return 0;
@@ -417,7 +453,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
}
#ifdef CONFIG_STACKTRACE
@@ -551,8 +587,17 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
{
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN; for all others scale the
+ * badness value into the [0, 2000] range, which we have been
+ * exporting for a long time, so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
- points = oom_badness(task, totalpages) * 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -660,10 +705,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/************************************************************************/
/* permission checks */
-static int proc_fd_access_allowed(struct inode *inode)
+static bool proc_fd_access_allowed(struct inode *inode)
{
struct task_struct *task;
- int allowed = 0;
+ bool allowed = false;
/* Allow access to a task's file descriptors if it is us or we
* may use ptrace attach to the process and find out that
* information.
@@ -676,7 +721,8 @@ static int proc_fd_access_allowed(struct inode *inode)
return allowed;
}
-int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
{
int error;
struct inode *inode = d_inode(dentry);
@@ -684,12 +730,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_MODE)
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
return 0;
}
@@ -717,7 +762,8 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info,
}
-static int proc_pid_permission(struct inode *inode, int mask)
+static int proc_pid_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
@@ -742,7 +788,7 @@ static int proc_pid_permission(struct inode *inode, int mask)
return -EPERM;
}
- return generic_permission(inode, mask);
+ return generic_permission(&nop_mnt_idmap, inode, mask);
}
@@ -781,23 +827,31 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ * - Returns mm_struct* on success
+ * - Returns error code on failure
+ */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
- struct mm_struct *mm = ERR_PTR(-ESRCH);
+ struct mm_struct *mm;
- if (task) {
- mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
- put_task_struct(task);
+ if (!task)
+ return ERR_PTR(-ESRCH);
- if (!IS_ERR_OR_NULL(mm)) {
- /* ensure this mm_struct can't be freed */
- mmgrab(mm);
- /* but do not pin its memory */
- mmput(mm);
- }
- }
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ put_task_struct(task);
+
+ if (IS_ERR(mm))
+ return mm == ERR_PTR(-ESRCH) ? NULL : mm;
+
+ /* ensure this mm_struct can't be freed */
+ mmgrab(mm);
+ /* but do not pin its memory */
+ mmput(mm);
return mm;
}
@@ -806,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct mm_struct *mm = proc_mem_open(inode, mode);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
@@ -815,12 +869,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -843,10 +916,12 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -920,6 +995,7 @@ static const struct file_operations proc_mem_operations = {
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
@@ -1040,13 +1116,14 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
OOM_SCORE_ADJ_MAX;
put_task_struct(task);
+ if (oom_adj > OOM_ADJUST_MAX)
+ oom_adj = OOM_ADJUST_MAX;
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
- static DEFINE_MUTEX(oom_adj_mutex);
struct mm_struct *mm = NULL;
struct task_struct *task;
int err = 0;
@@ -1086,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (atomic_read(&p->mm->mm_users) > 1) {
+ if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
mm = p->mm;
mmgrab(mm);
}
@@ -1141,11 +1218,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1201,11 +1277,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1260,6 +1335,10 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
kuid_t kloginuid;
int rv;
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1342,13 +1421,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1416,7 +1495,6 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
@@ -1466,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -1493,11 +1569,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1650,10 +1725,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -1661,8 +1735,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- if (same_thread_group(current, p))
+ if (same_thread_group(current, p)) {
set_task_comm(p, buffer);
+ proc_comm_connector(p);
+ }
else
count = -EINVAL;
@@ -1743,27 +1819,27 @@ out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
@@ -1863,11 +1939,9 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
-
- put_pid(pid);
}
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
@@ -1884,7 +1958,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -1896,11 +1970,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
- if (S_ISDIR(mode)) {
- spin_lock(&pid->lock);
- hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->lock);
- }
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1913,14 +1982,47 @@ out_unlock:
return NULL;
}
-int pid_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+/*
+ * Generate an inode and add it into @pid->inodes, so that the task will
+ * invalidate the inode's dentry before it is released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries (e.g. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating the '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating the '/proc/<tgid>' dentry, but we keep this to handle the
+ * single-thread-exit case: any one of the threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before it is released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
+int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
@@ -1959,23 +2061,26 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
* performed a setuid(), etc.
*
*/
-static int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
pid_update_inode(task, inode);
- put_task_struct(task);
- return 1;
+ ret = 1;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static inline bool proc_inode_is_dead(struct inode *inode)
@@ -2007,7 +2112,7 @@ const struct dentry_operations pid_dentry_operations =
* file type from dcache entry.
*
* Since all of the proc inode numbers are dynamically generated, the inode
- * numbers do not exist until the inode is cache. This means creating the
+ * numbers do not exist until the inode is cached. This means creating
* the dcache entry in readdir is necessary to keep the inode numbers
* reported by readdir in sync with the inode numbers reported
* by stat.
@@ -2022,7 +2127,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
unsigned type = DT_UNKNOWN;
ino_t ino = 1;
- child = d_hash_and_lookup(dir, &qname);
+ child = try_lookup_noperm(&qname, dir);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
@@ -2090,7 +2195,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
return 0;
}
-static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int map_files_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
unsigned long vm_start, vm_end;
bool exact_vma_exists = false;
@@ -2108,7 +2214,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_notask;
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm))
+ if (IS_ERR(mm))
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
@@ -2170,7 +2276,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
@@ -2189,16 +2295,16 @@ struct map_files_info {
};
/*
- * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
- * symlinks may be used to bypass permissions on ancestor directories in the
- * path to the file in question.
+ * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
+ * to concerns about how the symlinks may be used to bypass permissions on
+ * ancestor directories in the path to the file in question.
*/
static const char *
proc_map_files_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(&init_user_ns))
return ERR_PTR(-EPERM);
return proc_pid_get_link(dentry, inode, done);
@@ -2233,8 +2339,8 @@ proc_map_files_instantiate(struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2302,6 +2408,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ struct vma_iterator vmi;
genradix_init(&fa);
@@ -2340,7 +2447,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
* routine might require mmap_lock taken in might_fault().
*/
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (++pos <= ctx->pos)
@@ -2391,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2406,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
+ timer->sigq.info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -2523,10 +2624,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (rt_or_dl_task_policy(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
@@ -2602,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
inode->i_fop = p->fop;
ei->op = p->op;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2661,6 +2762,13 @@ out:
}
#ifdef CONFIG_SECURITY
+static int proc_pid_attr_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ return 0;
+}
+
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2672,8 +2780,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task, PROC_I(inode)->op.lsm,
- (char*)file->f_path.dentry->d_name.name,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -2690,6 +2798,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
void *page;
int rv;
+ /* A task may only write when it was the opener. */
+ if (file->private_data != current->mm)
+ return -EPERM;
+
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (!task) {
@@ -2726,7 +2838,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ rv = security_setprocattr(PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name, page,
count);
mutex_unlock(&current->signal->cred_guard_mutex);
@@ -2737,9 +2849,11 @@ out:
}
static const struct file_operations proc_pid_attr_operations = {
+ .open = proc_pid_attr_open,
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
.llseek = generic_file_llseek,
+ .release = mem_release,
};
#define LSM_DIR_OPS(LSM) \
@@ -2753,7 +2867,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
.read = generic_read_dir, \
- .iterate = proc_##LSM##_attr_dir_iterate, \
+ .iterate_shared = proc_##LSM##_attr_dir_iterate, \
.llseek = default_llseek, \
}; \
\
@@ -2773,27 +2887,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
#ifdef CONFIG_SECURITY_SMACK
static const struct pid_entry smack_attr_dir_stuff[] = {
- ATTR("smack", "current", 0666),
+ ATTR(LSM_ID_SMACK, "current", 0666),
};
LSM_DIR_OPS(smack);
#endif
#ifdef CONFIG_SECURITY_APPARMOR
static const struct pid_entry apparmor_attr_dir_stuff[] = {
- ATTR("apparmor", "current", 0666),
- ATTR("apparmor", "prev", 0444),
- ATTR("apparmor", "exec", 0666),
+ ATTR(LSM_ID_APPARMOR, "current", 0666),
+ ATTR(LSM_ID_APPARMOR, "prev", 0444),
+ ATTR(LSM_ID_APPARMOR, "exec", 0666),
};
LSM_DIR_OPS(apparmor);
#endif
static const struct pid_entry attr_dir_stuff[] = {
- ATTR(NULL, "current", 0666),
- ATTR(NULL, "prev", 0444),
- ATTR(NULL, "exec", 0666),
- ATTR(NULL, "fscreate", 0666),
- ATTR(NULL, "keycreate", 0666),
- ATTR(NULL, "sockcreate", 0666),
+ ATTR(LSM_ID_UNDEF, "current", 0666),
+ ATTR(LSM_ID_UNDEF, "prev", 0444),
+ ATTR(LSM_ID_UNDEF, "exec", 0666),
+ ATTR(LSM_ID_UNDEF, "fscreate", 0666),
+ ATTR(LSM_ID_UNDEF, "keycreate", 0666),
+ ATTR(LSM_ID_UNDEF, "sockcreate", 0666),
#ifdef CONFIG_SECURITY_SMACK
DIR("smack", 0555,
proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
@@ -2848,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
ret = 0;
mm = get_task_mm(task);
if (mm) {
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
len = snprintf(buffer, sizeof(buffer), "%08lx\n",
- ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+ ((flags & MMF_DUMP_FILTER_MASK) >>
MMF_DUMP_FILTER_SHIFT));
mmput(mm);
ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -2888,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
- set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm);
else
- clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm);
}
mmput(mm);
@@ -2912,11 +3028,10 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
- result = mutex_lock_killable(&task->signal->exec_update_mutex);
+ result = down_read_killable(&task->signal->exec_update_lock);
if (result)
return result;
@@ -2925,15 +3040,21 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ }
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -2952,7 +3073,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return result;
}
@@ -3121,7 +3242,50 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+ int ret = 0;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
+ seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+ seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
+ seq_printf(m, "ksm_merge_any: %s\n",
+ mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no");
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ mmput(mm);
+ return ret;
+ }
+ seq_printf(m, "ksm_mergeable: %s\n",
+ ksm_process_mergeable(mm) ? "yes" : "no");
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
+#ifdef CONFIG_KSTACK_ERASE_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3134,7 +3298,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
prev_depth, depth);
return 0;
}
-#endif /* CONFIG_STACKLEAK_METRICS */
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
/*
* Thread groups
@@ -3146,7 +3310,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3156,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
@@ -3243,12 +3405,19 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3314,7 +3483,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
@@ -3325,8 +3495,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
set_nlink(inode, nlink_tgid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
@@ -3409,14 +3578,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
return 0;
if (pos == TGID_OFFSET - 2) {
- struct inode *inode = d_inode(fs_info->proc_self);
- if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
if (pos == TGID_OFFSET - 1) {
- struct inode *inode = d_inode(fs_info->proc_thread_self);
- if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
@@ -3456,7 +3623,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
* This function makes sure that the node is always accessible for members of
* same thread group.
*/
-static int proc_tid_comm_permission(struct inode *inode, int mask)
+static int proc_tid_comm_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
bool is_same_tgroup;
struct task_struct *task;
@@ -3475,11 +3643,12 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
return 0;
}
- return generic_permission(inode, mask);
+ return generic_permission(&nop_mnt_idmap, inode, mask);
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
@@ -3487,7 +3656,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3497,9 +3666,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
@@ -3578,6 +3745,13 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3609,7 +3783,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
@@ -3620,8 +3795,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
set_nlink(inode, nlink_tid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
@@ -3701,11 +3875,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- pos = task = task->group_leader;
- do {
+ for_each_thread(task, pos) {
if (!nr--)
goto found;
- } while_each_thread(task, pos);
+ }
fail:
pos = NULL;
goto out;
@@ -3727,10 +3900,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
@@ -3752,24 +3923,27 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
- tid = (int)file->f_version;
- file->f_version = 0;
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
unsigned int len;
+
tid = task_pid_nr_ns(task, ns);
- len = snprintf(name, sizeof(name), "%u", tid);
+ if (!tid)
+ continue; /* The task has just exited. */
+ len = snprintf(name, sizeof(name), "%d", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@@ -3778,12 +3952,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
return 0;
}
-static int proc_task_getattr(const struct path *path, struct kstat *stat,
+static int proc_task_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct task_struct *p = get_proc_task(inode);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (p) {
stat->nlink += get_nr_threads(p);
@@ -3793,6 +3968,24 @@ static int proc_task_getattr(const struct path *path, struct kstat *stat,
return 0;
}
+/*
+ * proc_task_readdir() sets @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@@ -3803,7 +3996,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
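The base.c changes above route forced access to /proc/<pid>/mem through the new proc_mem.force_override= parameter (always, ptrace or never) and move the unsigned-offset handling to FOP_UNSIGNED_OFFSET. A user-space sketch of the read path these hooks serve, assuming a valid pid and hex address on the command line (illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Read a few bytes of another task's memory through /proc/<pid>/mem.
 * The file offset is the target virtual address; whether protections can
 * be bypassed (FOLL_FORCE) now depends on proc_mem.force_override= and,
 * in "ptrace" mode, on an active ptrace attach by the reader.
 */
int main(int argc, char **argv)
{
	char path[64], buf[64];
	unsigned long long addr;
	ssize_t n;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/mem", argv[1]);
	addr = strtoull(argv[2], NULL, 16);

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	n = pread(fd, buf, sizeof(buf), (off_t)addr);
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return n > 0 ? 0 : 1;
}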
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 9955d75c0585..87dcaae32ff8 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -26,11 +26,14 @@ static int boot_config_proc_show(struct seq_file *m, void *v)
static int __init copy_xbc_key_value_list(char *dst, size_t size)
{
struct xbc_node *leaf, *vnode;
- const char *val;
char *key, *end = dst + size;
+ const char *val;
+ char q;
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
xbc_for_each_key_value(leaf, val) {
ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
@@ -41,21 +44,31 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
vnode = xbc_node_get_child(leaf);
- if (vnode && xbc_node_is_array(vnode)) {
+ if (vnode) {
xbc_array_for_each_value(vnode, val) {
- ret = snprintf(dst, rest(dst, end), "\"%s\"%s",
- val, vnode->next ? ", " : "\n");
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, xbc_node_is_array(vnode) ? ", " : "\n");
if (ret < 0)
goto out;
dst += ret;
}
} else {
- ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val);
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
if (ret < 0)
break;
dst += ret;
}
}
+ if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
+ }
out:
kfree(key);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index fa762c5fbcb2..a6f76121955f 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
@@ -13,7 +14,11 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
static int __init proc_cmdline_init(void)
{
- proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde_make_permanent(pde);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
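Setting pde->size above makes /proc/cmdline advertise the saved command-line length (plus the trailing newline) instead of 0. A small sketch, assuming the size is propagated to i_size as for other permanent proc entries:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/proc/cmdline", &st))
		return 1;
	/* Previously 0; now saved_command_line_len + 1. */
	printf("/proc/cmdline size: %lld bytes\n", (long long)st.st_size);
	return 0;
}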
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index dfe6ce3505ce..b7cab1ad990d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NBCON, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
@@ -33,7 +34,16 @@ static int show_console_dev(struct seq_file *m, void *v)
if (con->device) {
const struct tty_driver *driver;
int index;
+
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
+ console_lock();
driver = con->device(con, &index);
+ console_unlock();
+
if (driver) {
dev = MKDEV(driver->major, driver->minor_start);
dev += index;
@@ -49,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
@@ -59,11 +69,17 @@ static int show_console_dev(struct seq_file *m, void *v)
}
static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(&console_mutex)
{
struct console *con;
loff_t off = 0;
- console_lock();
+ /*
+ * Hold the console_list_lock to guarantee safe traversal of the
+ * console list. SRCU cannot be used because there is no
+ * place to store the SRCU cookie.
+ */
+ console_list_lock();
for_each_console(con)
if (off++ == *pos)
break;
@@ -74,13 +90,15 @@ static void *c_start(struct seq_file *m, loff_t *pos)
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
struct console *con = v;
+
++*pos;
- return con->next;
+ return hlist_entry_safe(con->node.next, struct console, node);
}
static void c_stop(struct seq_file *m, void *v)
+ __releases(&console_mutex)
{
- console_unlock();
+ console_list_unlock();
}
static const struct seq_operations consoles_op = {
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index d0989a443c77..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,21 +5,17 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
static const struct proc_ops cpuinfo_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
};
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 37d38697eaf8..fe7bfcb7d049 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -3,6 +3,8 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include "internal.h"
static int devinfo_show(struct seq_file *f, void *v)
{
@@ -53,7 +55,10 @@ static const struct seq_operations devinfo_ops = {
static int __init proc_devices_init(void)
{
- proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..9eeccff49b2a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,10 +6,13 @@
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
+#include <linux/filelock.h>
#include <linux/proc_fs.h>
@@ -28,35 +31,35 @@ static int seq_show(struct seq_file *m, void *v)
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
-
+ task_lock(task);
+ files = task->files;
if (files) {
unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_locked(files, fd);
if (file) {
- struct fdtable *fdt = files_fdtable(files);
-
f_flags = file->f_flags;
- if (close_on_exec(fd, fdt))
+ if (close_on_exec(fd, files))
f_flags |= O_CLOEXEC;
get_file(file);
ret = 0;
}
spin_unlock(&files->file_lock);
- put_files_struct(files);
}
+ task_unlock(task);
+ put_task_struct(task);
if (ret)
return ret;
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
+ /* show_fd_locks() never dereferences files, so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -74,6 +77,34 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
return single_open(file, seq_show, inode);
}
+/*
+ * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
+ * that the current task has PTRACE_MODE_READ in addition to the normal
+ * POSIX-like checks.
+ */
+static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
+{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
+ return generic_permission(idmap, inode, mask);
+}
+
+static const struct inode_operations proc_fdinfo_file_inode_operations = {
+ .permission = proc_fdinfo_permission,
+ .setattr = proc_setattr,
+};
+
static const struct file_operations proc_fdinfo_file_operations = {
.open = seq_fdinfo_open,
.read = seq_read,
@@ -83,18 +114,13 @@ static const struct file_operations proc_fdinfo_file_operations = {
static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
- struct files_struct *files = get_files_struct(task);
struct file *file;
- if (!files)
- return false;
-
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file)
+ file = fget_task(task, fd);
+ if (file) {
*mode = file->f_mode;
- rcu_read_unlock();
- put_files_struct(files);
+ fput(file);
+ }
return !!file;
}
@@ -114,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
security_task_to_inode(task, inode);
}
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+static int tid_fd_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
struct inode *inode;
@@ -146,29 +173,22 @@ static const struct dentry_operations tid_fd_dentry_operations = {
static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- struct files_struct *files = NULL;
struct task_struct *task;
int ret = -ENOENT;
task = get_proc_task(d_inode(dentry));
if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
-
- if (files) {
unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
- spin_lock(&files->file_lock);
- fd_file = fcheck_files(files, fd);
+ fd_file = fget_task(task, fd);
if (fd_file) {
*path = fd_file->f_path;
path_get(&fd_file->f_path);
ret = 0;
+ fput(fd_file);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ put_task_struct(task);
}
return ret;
@@ -199,8 +219,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry,
ei->op.proc_get_link = proc_fd_link;
tid_fd_update_inode(task, inode, data->mode);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -229,7 +249,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
instantiate_t instantiate)
{
struct task_struct *p = get_proc_task(file_inode(file));
- struct files_struct *files;
unsigned int fd;
if (!p)
@@ -237,50 +256,65 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- files = get_files_struct(p);
- if (!files)
- goto out;
- rcu_read_lock();
- for (fd = ctx->pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, ctx->pos++) {
+ for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = fcheck_files(files, fd);
+ f = fget_task_next(p, &fd);
+ ctx->pos = fd + 2LL;
if (!f)
- continue;
+ break;
data.mode = f->f_mode;
- rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out_fd_loop;
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
out:
put_task_struct(p);
return 0;
}
-static int proc_readfd(struct file *file, struct dir_context *ctx)
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int proc_fd_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfd,
+ .iterate_shared = proc_fd_iterate,
.llseek = generic_file_llseek,
};
@@ -294,12 +328,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
*/
-int proc_fd_permission(struct inode *inode, int mask)
+int proc_fd_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
struct task_struct *p;
int rv;
- rv = generic_permission(inode, mask);
+ rv = generic_permission(&nop_mnt_idmap, inode, mask);
if (rv == 0)
return rv;
@@ -312,9 +347,20 @@ int proc_fd_permission(struct inode *inode, int mask)
return rv;
}
+static int proc_fd_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return proc_readfd_count(inode, &stat->size);
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
@@ -325,18 +371,20 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
if (!inode)
return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
ei->fd = data->fd;
+ inode->i_op = &proc_fdinfo_file_inode_operations;
+
inode->i_fop = &proc_fdinfo_file_operations;
tid_fd_update_inode(task, inode, 0);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *
@@ -345,7 +393,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
@@ -353,11 +401,12 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
+ .permission = proc_fdinfo_permission,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfdinfo,
+ .iterate_shared = proc_fdinfo_iterate,
.llseek = generic_file_llseek,
};
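Illustrative userspace sketch (not part of the patch): fdinfo files are now 0444 but gated by a PTRACE_MODE_READ check in ->permission(), and the payload gained an "ino:" line. Reading your own process's fdinfo for fd 0 still works:

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/proc/self/fdinfo/0", "r");

	if (!f)
		return 1;
	while (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);	/* pos:, flags:, mnt_id:, ino:, ... */
	fclose(f);
	return 0;
}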
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index f371a602bf58..7e7265f7e06f 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -10,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations;
extern const struct file_operations proc_fdinfo_operations;
extern const struct inode_operations proc_fdinfo_inode_operations;
-extern int proc_fd_permission(struct inode *inode, int mask);
+extern int proc_fd_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
static inline unsigned int proc_fd(struct inode *inode)
{
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2f9fa179194d..501889856461 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -115,25 +115,26 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
return true;
}
-static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+static int proc_notify_change(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
struct proc_dir_entry *de = PDE(inode);
int error;
- error = setattr_prepare(dentry, iattr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
return error;
- setattr_copy(inode, iattr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, iattr);
proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
return 0;
}
-static int proc_getattr(const struct path *path, struct kstat *stat,
+static int proc_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -145,7 +146,7 @@ static int proc_getattr(const struct path *path, struct kstat *stat,
}
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -164,15 +165,8 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
const char *cp = name, *next;
struct proc_dir_entry *de;
- de = *ret;
- if (!de)
- de = &proc_root;
-
- while (1) {
- next = strchr(cp, '/');
- if (!next)
- break;
-
+ de = *ret ?: &proc_root;
+ while ((next = strchr(cp, '/')) != NULL) {
de = pde_subdir_find(de, cp, next - cp);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -208,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum)
{
int i;
- i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
- GFP_KERNEL);
+ i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST,
+ GFP_KERNEL);
if (i < 0)
return i;
@@ -219,10 +213,11 @@ int proc_alloc_inum(unsigned int *inum)
void proc_free_inum(unsigned int inum)
{
- ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+ ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
-static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -259,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, de->proc_dops);
- return d_splice_alias(inode, dentry);
+ if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+ return d_splice_alias_ops(inode, dentry,
+ &proc_net_dentry_ops);
+ return d_splice_alias_ops(inode, dentry,
+ &proc_misc_dentry_ops);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -349,6 +347,17 @@ static const struct file_operations proc_dir_operations = {
.iterate_shared = proc_readdir,
};
+static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
/*
* proc directories can do almost nothing..
*/
@@ -358,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -365,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -442,7 +473,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
- ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
+ pde_force_lookup(ent);
out:
return ent;
@@ -457,9 +490,9 @@ struct proc_dir_entry *proc_symlink(const char *name,
(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
if (ent) {
- ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ ent->size = strlen(dest);
+ ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL);
if (ent->data) {
- strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
ent = proc_register(parent, ent);
} else {
@@ -471,8 +504,8 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
- struct proc_dir_entry *parent, void *data)
+struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data, bool force_lookup)
{
struct proc_dir_entry *ent;
@@ -484,10 +517,20 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->data = data;
ent->proc_dir_ops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
+ if (force_lookup) {
+ pde_force_lookup(ent);
+ }
ent = proc_register(parent, ent);
}
return ent;
}
+EXPORT_SYMBOL_GPL(_proc_mkdir);
+
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
+{
+ return _proc_mkdir(name, mode, parent, data, false);
+}
EXPORT_SYMBOL_GPL(proc_mkdir_data);
struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
@@ -540,12 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static inline void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -556,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -590,7 +626,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
static const struct proc_ops proc_seq_ops = {
/* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = proc_seq_release,
};
@@ -621,7 +657,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
static const struct proc_ops proc_single_ops = {
/* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
@@ -662,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde)
}
}
+static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
+{
+ rb_erase(&pde->subdir_node, &parent->subdir);
+ RB_CLEAR_NODE(&pde->subdir_node);
+}
+
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@@ -684,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
WARN(1, "removing permanent /proc entry '%s'", de->name);
de = NULL;
} else {
- rb_erase(&de->subdir_node, &parent->subdir);
+ pde_erase(de, parent);
if (S_ISDIR(de->mode))
parent->nlink--;
}
@@ -728,19 +770,19 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
root->parent->name, root->name);
return -EINVAL;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ pde_erase(root, parent);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
- if (unlikely(pde_is_permanent(root))) {
+ if (unlikely(pde_is_permanent(next))) {
write_unlock(&proc_subdir_lock);
WARN(1, "removing permanent /proc entry '%s/%s'",
next->parent->name, next->name);
return -EINVAL;
}
- rb_erase(&next->subdir_node, &de->subdir);
+ pde_erase(next, de);
de = next;
continue;
}
@@ -776,12 +818,6 @@ void proc_remove(struct proc_dir_entry *de)
}
EXPORT_SYMBOL(proc_remove);
-void *PDE_DATA(const struct inode *inode)
-{
- return __PDE_DATA(inode);
-}
-EXPORT_SYMBOL(PDE_DATA);
-
/*
* Pull a user buffer into memory and pass it to the file's write handler if
* one is supplied. The ->write() method is permitted to modify the
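For context, a hedged out-of-tree sketch (all names hypothetical) of a proc entry whose ops exercise the flag caching added above: presence of proc_read_iter and proc_lseek is recorded on the PDE at registration time, and reads then go through the iter-based file_operations.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from /proc/demo\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	/* PROC_ENTRY_PERMANENT is only for entries that are never removed */
	.proc_open	= demo_open,
	.proc_read_iter	= seq_read_iter,	/* cached as PROC_ENTRY_proc_read_iter */
	.proc_lseek	= seq_lseek,		/* cached as PROC_ENTRY_proc_lseek */
	.proc_release	= single_release,
};

static int __init demo_init(void)
{
	return proc_create("demo", 0444, NULL, &demo_proc_ops) ? 0 : -ENOMEM;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	remove_proc_entry("demo", NULL);
}
module_exit(demo_exit);
MODULE_LICENSE("GPL");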
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 28d6105e908e..b7634f975d98 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,13 +26,10 @@
#include <linux/mount.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
-
#include "internal.h"
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@@ -40,21 +37,12 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
- if (ei->pid) {
+ if (ei->pid)
proc_pid_evict_inode(ei);
- ei->pid = NULL;
- }
-
- /* Let go of any associated proc directory entry */
- de = ei->pde;
- if (de) {
- pde_put(de);
- ei->pde = NULL;
- }
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -66,7 +54,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
@@ -82,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
+ struct proc_inode *ei = PROC_I(inode);
+
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
@@ -97,7 +92,7 @@ void __init proc_init_kmemcache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
pde_opener_cache =
@@ -112,18 +107,15 @@ void __init proc_init_kmemcache(void)
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
@@ -195,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
@@ -214,7 +206,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
__releases(&pde->pde_unload_lock)
{
@@ -224,9 +224,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
- *
- * Therefore, first process to enter this function does ->release() and
- * signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
@@ -240,10 +237,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
+
file = pdeo->file;
pde->proc_ops->proc_release(file_inode(file), file);
+
spin_lock(&pde->pde_unload_lock);
- /* After ->release. */
+ /* Strictly after ->proc_release, see above. */
list_del(&pdeo->lh);
c = pdeo->c;
spin_unlock(&pde->pde_unload_lock);
@@ -273,35 +272,38 @@ void proc_entry_rundown(struct proc_dir_entry *de)
spin_unlock(&de->pde_unload_lock);
}
-static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
-{
- typeof_member(struct proc_ops, proc_lseek) lseek;
-
- lseek = pde->proc_ops->proc_lseek;
- if (!lseek)
- lseek = default_llseek;
- return lseek(file, offset, whence);
-}
-
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
if (pde_is_permanent(pde)) {
- return pde_lseek(pde, file, offset, whence);
+ return pde->proc_ops->proc_lseek(file, offset, whence);
} else if (use_pde(pde)) {
- rv = pde_lseek(pde, file, offset, whence);
+ rv = pde->proc_ops->proc_lseek(file, offset, whence);
unuse_pde(pde);
}
return rv;
}
-static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- typeof_member(struct proc_ops, proc_read) read;
+ struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ if (pde_is_permanent(pde))
+ return pde->proc_ops->proc_read_iter(iocb, iter);
+
+ if (!use_pde(pde))
+ return -EIO;
+ ret = pde->proc_ops->proc_read_iter(iocb, iter);
+ unuse_pde(pde);
+ return ret;
+}
- read = pde->proc_ops->proc_read;
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ const auto read = pde->proc_ops->proc_read;
if (read)
return read(file, buf, count, ppos);
return -EIO;
@@ -323,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_write) write;
-
- write = pde->proc_ops->proc_write;
+ const auto write = pde->proc_ops->proc_write;
if (write)
return write(file, buf, count, ppos);
return -EIO;
@@ -347,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
- typeof_member(struct proc_ops, proc_poll) poll;
-
- poll = pde->proc_ops->proc_poll;
+ const auto poll = pde->proc_ops->proc_poll;
if (poll)
return poll(file, pts);
return DEFAULT_POLLMASK;
@@ -371,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
-
- ioctl = pde->proc_ops->proc_ioctl;
+ const auto ioctl = pde->proc_ops->proc_ioctl;
if (ioctl)
return ioctl(file, cmd, arg);
return -ENOTTY;
@@ -396,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl;
if (compat_ioctl)
return compat_ioctl(file, cmd, arg);
return -ENOTTY;
@@ -420,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
- typeof_member(struct proc_ops, proc_mmap) mmap;
-
- mmap = pde->proc_ops->proc_mmap;
+ const auto mmap = pde->proc_ops->proc_mmap;
if (mmap)
return mmap(file, vma);
return -EIO;
@@ -447,15 +439,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ if (pde->proc_ops->proc_get_unmapped_area)
+ return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
- get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags);
#endif
- if (get_area)
- return get_area(file, orig_addr, len, pgoff, flags);
+
return orig_addr;
}
@@ -478,13 +468,14 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
static int proc_reg_open(struct inode *inode, struct file *file)
{
- struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
typeof_member(struct proc_ops, proc_open) open;
- typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (!pde_has_proc_lseek(pde))
+ file->f_mode &= ~FMODE_LSEEK;
+
if (pde_is_permanent(pde)) {
open = pde->proc_ops->proc_open;
if (open)
@@ -492,9 +483,6 @@ static int proc_reg_open(struct inode *inode, struct file *file)
return rv;
}
- if (fs_info->pidonly == PROC_PIDONLY_ON)
- return -ENOENT;
-
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -509,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_ops->proc_release;
+ const auto release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -546,12 +534,9 @@ static int proc_reg_release(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
if (pde_is_permanent(pde)) {
- typeof_member(struct proc_ops, proc_release) release;
-
- release = pde->proc_ops->proc_release;
- if (release) {
+ const auto release = pde->proc_ops->proc_release;
+ if (release)
return release(inode, file);
- }
return 0;
}
@@ -572,9 +557,19 @@ static const struct file_operations proc_reg_file_ops = {
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = proc_reg_compat_ioctl,
-#endif
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .splice_read = copy_splice_read,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -582,12 +577,27 @@ static const struct file_operations proc_reg_file_ops = {
};
#ifdef CONFIG_COMPAT
-static const struct file_operations proc_reg_file_ops_no_compat = {
+static const struct file_operations proc_reg_file_ops_compat = {
.llseek = proc_reg_llseek,
.read = proc_reg_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .splice_read = copy_splice_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -619,42 +629,52 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
struct inode *inode = new_inode(sb);
- if (inode) {
- inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- PROC_I(inode)->pde = de;
+ if (!inode) {
+ pde_put(de);
+ return NULL;
+ }
- if (is_empty_pde(de)) {
- make_empty_dir_inode(inode);
- return inode;
- }
- if (de->mode) {
- inode->i_mode = de->mode;
- inode->i_uid = de->uid;
- inode->i_gid = de->gid;
- }
- if (de->size)
- inode->i_size = de->size;
- if (de->nlink)
- set_nlink(inode, de->nlink);
+ inode->i_private = de->data;
+ inode->i_ino = de->low_ino;
+ simple_inode_init_ts(inode);
+ PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
+ }
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = de->proc_iops;
+ if (de->mode) {
+ inode->i_mode = de->mode;
+ inode->i_uid = de->uid;
+ inode->i_gid = de->gid;
+ }
+ if (de->size)
+ inode->i_size = de->size;
+ if (de->nlink)
+ set_nlink(inode, de->nlink);
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ if (pde_has_proc_read_iter(de))
+ inode->i_fop = &proc_iter_file_ops;
+ else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (!de->proc_ops->proc_compat_ioctl) {
- inode->i_fop = &proc_reg_file_ops_no_compat;
- }
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
+ inode->i_fop = &proc_iter_file_ops_compat;
+ else
+ inode->i_fop = &proc_reg_file_ops_compat;
+ }
#endif
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = de->proc_iops;
- inode->i_fop = de->proc_dir_ops;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = de->proc_iops;
- inode->i_fop = NULL;
- } else
- BUG();
- } else
- pde_put(de);
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else {
+ BUG();
+ }
return inode;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 917cc85e3466..c1e8eb984da8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -13,6 +13,7 @@
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
struct ctl_table_header;
struct mempolicy;
@@ -43,7 +44,6 @@ struct proc_dir_entry {
const struct proc_ops *proc_ops;
const struct file_operations *proc_dir_ops;
};
- const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -79,6 +79,30 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
return pde->flags & PROC_ENTRY_PERMANENT;
}
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -87,7 +111,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
- const char *lsm;
+ int lsmid;
};
struct proc_inode {
@@ -96,7 +120,7 @@ struct proc_inode {
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
- struct ctl_table *sysctl_entry;
+ const struct ctl_table *sysctl_entry;
struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
@@ -115,11 +139,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
-static inline void *__PDE_DATA(const struct inode *inode)
-{
- return PDE(inode)->data;
-}
-
static inline struct pid *proc_pid(const struct inode *inode)
{
return PROC_I(inode)->pid;
@@ -142,6 +161,80 @@ unsigned name_to_int(const struct qstr *qstr);
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+#ifdef CONFIG_PAGE_MAPCOUNT
+/**
+ * folio_precise_page_mapcount() - Number of mappings of this folio page.
+ * @folio: The folio.
+ * @page: The page.
+ *
+ * The number of present user page table entries that reference this page
+ * as tracked via the RMAP: either referenced directly (PTE) or as part of
+ * a larger area that covers this page (e.g., PMD).
+ *
+ * Use this function only for the calculation of existing statistics
+ * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
+ *
+ * Do not add new users.
+ *
+ * Returns: The number of mappings of this folio page. 0 for
+ * folios that are not mapped to user space or are not tracked via the RMAP
+ * (e.g., shared zeropage).
+ */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+
+ return mapcount;
+}
+#else /* !CONFIG_PAGE_MAPCOUNT */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_PAGE_MAPCOUNT */
+
+/**
+ * folio_average_page_mapcount() - Average number of mappings per page in this
+ * folio
+ * @folio: The folio.
+ *
+ * The average number of user page table entries that reference each page in
+ * this folio as tracked via the RMAP: either referenced directly (PTE) or
+ * as part of a larger area that covers this page (e.g., PMD).
+ *
+ * The average is calculated by rounding to the nearest integer; however,
+ * to avoid duplicated code in current callers, the average is at least
+ * 1 if any page of the folio is mapped.
+ *
+ * Returns: The average number of mappings per page in this folio.
+ */
+static inline int folio_average_page_mapcount(struct folio *folio)
+{
+ int mapcount, entire_mapcount, avg;
+
+ if (!folio_test_large(folio))
+ return atomic_read(&folio->_mapcount) + 1;
+
+ mapcount = folio_large_mapcount(folio);
+ if (unlikely(mapcount <= 0))
+ return 0;
+ entire_mapcount = folio_entire_mapcount(folio);
+ if (mapcount <= entire_mapcount)
+ return entire_mapcount;
+ mapcount -= entire_mapcount;
+
+ /* Round to closest integer ... */
+ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio);
+ /* ... but return at least 1. */
+ return max_t(int, avg + entire_mapcount, 1);
+}
/*
* array.c
*/
@@ -162,8 +255,10 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
* base.c
*/
extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int proc_setattr(struct dentry *, struct iattr *);
+extern int pid_getattr(struct mnt_idmap *, const struct path *,
+ struct kstat *, u32, unsigned int);
+extern int proc_setattr(struct mnt_idmap *, struct dentry *,
+ struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
@@ -190,10 +285,9 @@ struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_e
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
-static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
+static inline void pde_get(struct proc_dir_entry *pde)
{
refcount_inc(&pde->refcnt);
- return pde;
}
extern void pde_put(struct proc_dir_entry *);
@@ -279,18 +373,27 @@ static inline void proc_tty_init(void) {}
extern struct proc_dir_entry proc_root;
extern void proc_self_init(void);
+extern unsigned self_inum, thread_self_inum;
/*
* task_[no]mmu.c
*/
struct mem_size_stats;
+
+struct proc_maps_locking_ctx {
+ struct mm_struct *mm;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
+};
+
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
- struct mm_struct *mm;
-#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
-#endif
+ struct vma_iterator iter;
+ loff_t last_pos;
+ struct proc_maps_locking_ctx lock_ctx;
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
@@ -310,3 +413,22 @@ extern unsigned long task_statm(struct mm_struct *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
+
+extern const struct dentry_operations proc_net_dentry_ops;
+static inline void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
+}
+
+/*
+ * Add a new procfs dentry that can't serve as a mountpoint. That should
+ * encompass anything that is ephemeral and can just disappear while the
+ * process is still around.
+ */
+static inline struct dentry *proc_splice_unmountable(struct inode *inode,
+ struct dentry *dentry, const struct dentry_operations *d_ops)
+{
+ dont_mount(dentry);
+ return d_splice_alias_ops(inode, dentry, d_ops);
+}
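A worked example of the rounding in folio_average_page_mapcount() above, with made-up numbers: an order-9 folio (512 pages) with 800 PTE mappings and no entire mappings gives

	avg = (800 + 512/2) >> 9 = 1056 >> 9 = 2	/* 800/512 = 1.56, rounded to nearest */
	return max(avg + entire_mapcount, 1) = 2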
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index cb0edc7cbf09..714a22ded8a8 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -11,13 +11,13 @@
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= nr_irqs) ? pos : NULL;
+ return *pos <= irq_get_nr_irqs() ? pos : NULL;
}
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > nr_irqs)
+ if (*pos > irq_get_nr_irqs())
return NULL;
return pos;
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8ba492d44e68..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
@@ -18,14 +18,13 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -35,8 +34,6 @@
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -51,8 +48,26 @@ static struct proc_dir_entry *proc_root_kcore;
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
+#ifndef kc_xlate_dev_mem_ptr
+#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr
+static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys)
+{
+ return __va(phys);
+}
+#endif
+#ifndef kc_unxlate_dev_mem_ptr
+#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr
+static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
+{
+}
+#endif
+
static LIST_HEAD(kclist_head);
-static DECLARE_RWSEM(kclist_lock);
+static int kcore_nphdr;
+static size_t kcore_phdrs_len;
+static size_t kcore_notes_len;
+static size_t kcore_data_offset;
+DEFINE_STATIC_PERCPU_RWSEM(kclist_lock);
static int kcore_need_update = 1;
/*
@@ -88,33 +103,34 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
list_add_tail(&new->list, &kclist_head);
}
-static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
- size_t *data_offset)
+static void update_kcore_size(void)
{
size_t try, size;
struct kcore_list *m;
- *nphdr = 1; /* PT_NOTE */
+ kcore_nphdr = 1; /* PT_NOTE */
size = 0;
list_for_each_entry(m, &kclist_head, list) {
try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > size)
size = try;
- *nphdr = *nphdr + 1;
+ kcore_nphdr++;
}
- *phdrs_len = *nphdr * sizeof(struct elf_phdr);
- *notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
- VMCOREINFO_NOTE_NAME_BYTES +
- ALIGN(sizeof(struct elf_prstatus), 4) +
- ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4) +
- ALIGN(vmcoreinfo_size, 4));
- *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
- *notes_len);
- return *data_offset + size;
+ kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
+ kcore_notes_len = (4 * sizeof(struct elf_note) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
+ kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len +
+ kcore_notes_len);
+ proc_root_kcore->size = kcore_data_offset + size;
}
#ifdef CONFIG_HIGHMEM
@@ -193,8 +209,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
return 1;
p = pfn_to_page(pfn);
- if (!memmap_valid_within(pfn, p, page_zone(p)))
- return 1;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
@@ -202,7 +216,7 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
ent->addr = (unsigned long)page_to_virt(p);
ent->size = nr_pages << PAGE_SHIFT;
- if (!virt_addr_valid(ent->addr))
+ if (!virt_addr_valid((void *)ent->addr))
goto free_out;
/* cut not-mapped area. ....from ppc-32 code. */
@@ -238,7 +252,7 @@ static int kcore_ram_list(struct list_head *list)
int nid, ret;
unsigned long end_pfn;
- /* Not inialized....update now */
+ /* Not initialized....update now */
/* find out "max pfn" */
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
@@ -259,12 +273,10 @@ static int kcore_update_ram(void)
{
LIST_HEAD(list);
LIST_HEAD(garbage);
- int nphdr;
- size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos;
int ret = 0;
- down_write(&kclist_lock);
+ percpu_down_write(&kclist_lock);
if (!xchg(&kcore_need_update, 0))
goto out;
@@ -282,11 +294,10 @@ static int kcore_update_ram(void)
}
list_splice_tail(&list, &kclist_head);
- proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
- &data_offset);
+ update_kcore_size();
out:
- up_write(&kclist_lock);
+ percpu_up_write(&kclist_lock);
list_for_each_entry_safe(pos, tmp, &garbage, list) {
list_del(&pos->list);
kfree(pos);
@@ -310,24 +321,29 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct file *file = iocb->ki_filp;
char *buf = file->private_data;
- size_t phdrs_offset, notes_offset, data_offset;
- size_t phdrs_len, notes_len;
+ loff_t *fpos = &iocb->ki_pos;
+ size_t phdrs_offset, notes_offset;
+ size_t page_offline_frozen = 1;
struct kcore_list *m;
size_t tsz;
- int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
- down_read(&kclist_lock);
+ percpu_down_read(&kclist_lock);
+ /*
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
+ */
+ page_offline_freeze();
- get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr);
- notes_offset = phdrs_offset + phdrs_len;
+ notes_offset = phdrs_offset + kcore_phdrs_len;
/* ELF file header. */
if (buflen && *fpos < sizeof(struct elfhdr)) {
@@ -349,25 +365,24 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
.e_flags = ELF_CORE_EFLAGS,
.e_ehsize = sizeof(struct elfhdr),
.e_phentsize = sizeof(struct elf_phdr),
- .e_phnum = nphdr,
+ .e_phnum = kcore_nphdr,
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
/* ELF program headers. */
- if (buflen && *fpos < phdrs_offset + phdrs_len) {
+ if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) {
struct elf_phdr *phdrs, *phdr;
- phdrs = kzalloc(phdrs_len, GFP_KERNEL);
+ phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL);
if (!phdrs) {
ret = -ENOMEM;
goto out;
@@ -375,18 +390,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdrs[0].p_type = PT_NOTE;
phdrs[0].p_offset = notes_offset;
- phdrs[0].p_filesz = notes_len;
+ phdrs[0].p_filesz = kcore_notes_len;
phdr = &phdrs[1];
list_for_each_entry(m, &kclist_head, list) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X;
- phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
- if (m->type == KCORE_REMAP)
- phdr->p_vaddr = (size_t)m->vaddr;
- else
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
+ phdr->p_offset = kc_vaddr_to_offset(m->addr)
+ + kcore_data_offset;
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr);
else if (m->type == KCORE_TEXT)
phdr->p_paddr = __pa_symbol(m->addr);
@@ -397,22 +410,22 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdr++;
}
- tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ tsz = min_t(size_t, buflen,
+ phdrs_offset + kcore_phdrs_len - *fpos);
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
/* ELF note segment. */
- if (buflen && *fpos < notes_offset + notes_len) {
+ if (buflen && *fpos < notes_offset + kcore_notes_len) {
struct elf_prstatus prstatus = {};
struct elf_prpsinfo prpsinfo = {
.pr_sname = 'R',
@@ -421,20 +434,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
char *notes;
size_t i = 0;
- strlcpy(prpsinfo.pr_psargs, saved_command_line,
+ strscpy(prpsinfo.pr_psargs, saved_command_line,
sizeof(prpsinfo.pr_psargs));
- notes = kzalloc(notes_len, GFP_KERNEL);
+ notes = kzalloc(kcore_notes_len, GFP_KERNEL);
if (!notes) {
ret = -ENOMEM;
goto out;
}
- append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
sizeof(prstatus));
- append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
sizeof(prpsinfo));
- append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
arch_task_struct_size);
/*
* vmcoreinfo_size is mostly constant after init time, but it
@@ -445,17 +458,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
vmcoreinfo_data,
- min(vmcoreinfo_size, notes_len - i));
+ min(vmcoreinfo_size, kcore_notes_len - i));
- tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ tsz = min_t(size_t, buflen,
+ notes_offset + kcore_notes_len - *fpos);
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -464,81 +477,156 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list.
*/
- start = kc_offset_to_vaddr(*fpos - data_offset);
+ start = kc_offset_to_vaddr(*fpos - kcore_data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
m = NULL;
while (buflen) {
+ struct page *page;
+ unsigned long pfn;
+ phys_addr_t phys;
+ void *__start;
+
/*
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
*/
if (!m || start < m->addr || start >= m->addr + m->size) {
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr &&
- start < m->addr + m->size)
+ struct kcore_list *pos;
+
+ m = NULL;
+ list_for_each_entry(pos, &kclist_head, list) {
+ if (start >= pos->addr &&
+ start < pos->addr + pos->size) {
+ m = pos;
break;
+ }
}
}
- if (&m->list == &kclist_head) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- m = NULL; /* skip the list anchor */
- } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
- if (clear_user(buffer, tsz)) {
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
+ }
+
+ if (!m) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- } else if (m->type == KCORE_VMALLOC) {
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- } else if (m->type == KCORE_USER) {
+ break;
+ }
+ case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- } else {
- if (kern_addr_valid(start)) {
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- if (probe_kernel_read(buf, (void *) start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
+ break;
+ case KCORE_RAM:
+ phys = __pa(start);
+ pfn = phys >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
+ if (m->type == KCORE_RAM) {
+ __start = kc_xlate_dev_mem_ptr(phys);
+ if (!__start) {
+ ret = -ENOMEM;
+ if (iov_iter_zero(tsz, iter) != tsz)
ret = -EFAULT;
- goto out;
- }
+ goto out;
}
} else {
- if (clear_user(buffer, tsz)) {
+ __start = (void *)start;
+ }
+
+ /*
+ * Sadly we must use a bounce buffer here to be able to
+ * make use of copy_from_kernel_nofault(), as these
+ * memory regions might not always be mapped on all
+ * architectures.
+ */
+ ret = copy_from_kernel_nofault(buf, __start, tsz);
+ if (m->type == KCORE_RAM)
+ kc_unxlate_dev_mem_ptr(phys, __start);
+ if (ret) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
+ ret = 0;
+ /*
+ * We know the bounce buffer is safe to copy from, so
+ * use _copy_to_iter() directly.
+ */
+ } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
}
+skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
out:
- up_read(&kclist_lock);
+ page_offline_thaw();
+ percpu_up_read(&kclist_lock);
if (ret)
return ret;
return orig_buflen - buflen;
@@ -575,7 +663,8 @@ static int release_kcore(struct inode *inode, struct file *file)
}
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
.proc_release = release_kcore,
.proc_lseek = default_llseek,
@@ -594,10 +683,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -650,7 +735,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
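
Purely as an illustration (not part of the patch): read_kcore_iter() above serves reads through an iov_iter in page-sized chunks and zero-fills anything it cannot safely copy, so userspace consumes /proc/kcore like an ordinary file. A minimal sketch that sanity-checks the ELF identification bytes at offset 0 (error handling kept deliberately terse):

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	unsigned char ident[EI_NIDENT];
	int fd = open("/proc/kcore", O_RDONLY);	/* typically requires root */

	if (fd < 0) {
		perror("open /proc/kcore");
		return 1;
	}
	if (read(fd, ident, sizeof(ident)) != (ssize_t)sizeof(ident) ||
	    memcmp(ident, ELFMAG, SELFMAG) != 0) {
		fprintf(stderr, "not an ELF image?\n");
		close(fd);
		return 1;
	}
	printf("ELF class: %s\n",
	       ident[EI_CLASS] == ELFCLASS64 ? "64-bit" : "32-bit");
	close(fd);
	return 0;
}
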
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887f..2fc92a13f9f8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,11 +15,8 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <linux/uaccess.h>
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..817981e57223 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
+#include "internal.h"
static int loadavg_proc_show(struct seq_file *m, void *v)
{
@@ -16,7 +17,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
get_avenrun(avnrun, FIXED_1/200, 0);
- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
@@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
static int __init proc_loadavg_init(void)
{
- proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_loadavg_init);
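
Purely as an illustration (not part of the patch): the seq_printf() format above switches the runnable-task count to an unsigned field (%u), so the record reads e.g. "0.01 0.05 0.10 2/345 6789". A minimal userspace reader matching that layout:

#include <stdio.h>

int main(void)
{
	unsigned long a1i, a1f, a2i, a2f, a3i, a3f;
	unsigned int running;
	int threads, last_pid;
	FILE *f = fopen("/proc/loadavg", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu.%lu %lu.%lu %lu.%lu %u/%d %d",
		   &a1i, &a1f, &a2i, &a2f, &a3i, &a3f,
		   &running, &threads, &last_pid) == 9)
		printf("runnable=%u threads=%d last_pid=%d\n",
		       running, threads, last_pid);
	fclose(f);
	return 0;
}
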
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index e9a6841fc25b..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
@@ -16,6 +17,7 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
#include "internal.h"
@@ -41,7 +43,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_meminfo(&i);
si_swapinfo(&i);
- committed = percpu_counter_read_positive(&vm_committed_as);
+ committed = vm_memory_committed();
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -52,8 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
- sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
- sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
+ sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -86,6 +88,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_long_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
@@ -101,19 +109,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SReclaimable: ", sreclaimable);
show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
- global_zone_page_state(NR_KERNEL_STACK_KB));
+ global_node_page_state(NR_KERNEL_STACK_KB));
#ifdef CONFIG_SHADOW_CALL_STACK
seq_printf(m, "ShadowCallStack:%8lu kB\n",
- global_zone_page_state(NR_KERNEL_SCS_KB));
+ global_node_page_state(NR_KERNEL_SCS_KB));
#endif
show_val_kb(m, "PageTables: ",
- global_zone_page_state(NR_PAGETABLE));
+ global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
- show_val_kb(m, "Bounce: ",
- global_zone_page_state(NR_BOUNCE));
- show_val_kb(m, "WritebackTmp: ",
- global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "Bounce: ", 0);
+ show_val_kb(m, "WritebackTmp: ", 0);
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
@@ -122,6 +130,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+ memtest_report_meminfo(m);
+
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
@@ -129,15 +139,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
- global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_ANON_THPS));
show_val_kb(m, "ShmemHugePages: ",
- global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_THPS));
show_val_kb(m, "ShmemPmdMapped: ",
- global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_PMDMAPPED));
show_val_kb(m, "FileHugePages: ",
- global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_THPS));
show_val_kb(m, "FilePmdMapped: ",
- global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_PMDMAPPED));
#endif
#ifdef CONFIG_CMA
@@ -146,6 +156,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_zone_page_state(NR_FREE_CMA_PAGES));
#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ show_val_kb(m, "Unaccepted: ",
+ global_zone_page_state(NR_UNACCEPTED));
+#endif
+ show_val_kb(m, "Balloon: ",
+ global_node_page_state(NR_BALLOON_PAGES));
+
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
@@ -155,7 +172,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
static int __init proc_meminfo_init(void)
{
- proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8e159fc78c0a..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
- res = readlink_copy(buffer, buflen, name);
+ res = readlink_copy(buffer, buflen, name, strlen(name));
}
put_task_struct(task);
return res;
@@ -111,14 +111,13 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
ei->ns_ops = ns_ops;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -152,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
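
Purely as an illustration (not part of the patch): proc_ns_readlink() above resolves the per-task namespace symlinks via ns_get_name(), so a plain readlink(2) from userspace is enough to obtain the "<type>:[<inode>]" name:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t len = readlink("/proc/self/ns/net", buf, sizeof(buf) - 1);

	if (len < 0) {
		perror("readlink");
		return 1;
	}
	buf[len] = '\0';
	printf("%s\n", buf);	/* e.g. "net:[4026531840]" */
	return 0;
}
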
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
@@ -59,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f909243d4a66..f9b2c2c906cd 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -19,7 +20,12 @@
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
-#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+
+enum kpage_operation {
+ KPAGE_FLAGS,
+ KPAGE_COUNT,
+ KPAGE_CGROUP,
+};
static inline unsigned long get_max_dump_pfn(void)
{
@@ -36,21 +42,33 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
+static ssize_t kpage_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ enum kpage_operation op)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
+ struct page *page;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
- u64 pcount;
+ u64 info;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -64,14 +82,27 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
-
- if (!ppage || PageSlab(ppage) || page_has_type(ppage))
- pcount = 0;
- else
- pcount = page_mapcount(ppage);
-
- if (put_user(pcount, out)) {
+ page = pfn_to_online_page(pfn);
+
+ if (page) {
+ switch (op) {
+ case KPAGE_FLAGS:
+ info = stable_page_flags(page);
+ break;
+ case KPAGE_COUNT:
+ info = get_kpage_count(page);
+ break;
+ case KPAGE_CGROUP:
+ info = page_cgroup_ino(page);
+ break;
+ default:
+ info = 0;
+ break;
+ }
+ } else
+ info = 0;
+
+ if (put_user(info, out)) {
ret = -EFAULT;
break;
}
@@ -89,26 +120,37 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
+/* /proc/kpagecount - an array exposing page mapcounts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page mapcount.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return kpage_read(file, buf, count, ppos, KPAGE_COUNT);
+}
+
static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
{
return ((kflags >> kbit) & 1) << ubit;
}
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
{
- u64 k;
- u64 u;
+ const struct folio *folio;
+ struct page_snapshot ps;
+ unsigned long k;
+ unsigned long mapping;
+ bool is_anon;
+ u64 u = 0;
/*
* pseudo flag: KPF_NOPAGE
@@ -117,76 +159,63 @@ u64 stable_page_flags(struct page *page)
if (!page)
return 1 << KPF_NOPAGE;
- k = page->flags;
- u = 0;
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
+
+ k = folio->flags.f;
+ mapping = (unsigned long)folio->mapping;
+ is_anon = mapping & FOLIO_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
- *
- * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
- if (PageAnon(page))
+ if (is_anon) {
u |= 1 << KPF_ANON;
- if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ if (mapping & FOLIO_MAPPING_KSM)
+ u |= 1 << KPF_KSM;
+ }
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
- if (PageTail(page))
+ if (ps.idx == 0)
+ u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+ else
u |= 1 << KPF_COMPOUND_TAIL;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
- /*
- * PageTransCompound can be true for non-huge compound pages (slab
- * pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
- * to make sure a given page is a thp, not a non-huge compound page.
- */
- else if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
- else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
- }
- } else if (is_zero_pfn(page_to_pfn(page)))
+ else if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio)) {
+ /* Note: we indicate any THPs here, not just PMD-sized ones */
+ u |= 1 << KPF_THP;
+ } else if (is_huge_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ } else if (is_zero_pfn(ps.pfn)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ }
-
- /*
- * Caveats on high order pages: page->_refcount will only be set
- * -1 on the head page; SLUB/SLQB do the same for PG_slab;
- * SLOB won't set PG_slab at all on compound pages.
- */
- if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY)
u |= 1 << KPF_BUDDY;
- if (PageOffline(page))
+ if (folio_test_offline(folio))
u |= 1 << KPF_OFFLINE;
- if (PageTable(page))
+ if (folio_test_pgtable(folio))
u |= 1 << KPF_PGTABLE;
+ if (folio_test_slab(folio))
+ u |= 1 << KPF_SLAB;
- if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+ u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
+#else
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
+#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
- u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
-
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -196,7 +225,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+ if ((k & SWAPCACHE) == SWAPCACHE)
u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
@@ -204,120 +234,54 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
#ifdef CONFIG_MEMORY_FAILURE
- u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
-#endif
-
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
+ if (u & (1 << KPF_HUGE))
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ else
+ u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
+#endif
return u;
-};
+}
+EXPORT_SYMBOL_GPL(stable_page_flags);
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
static ssize_t kpageflags_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (put_user(stable_page_flags(ppage), out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_FLAGS);
}
static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpageflags_read,
};
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
- u64 ino;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (ppage)
- ino = page_cgroup_ino(ppage);
- else
- ino = 0;
-
- if (put_user(ino, out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_CGROUP);
}
-
static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecgroup_read,
};
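
Purely as an illustration (not part of the patch): the files served by kpage_read() above are flat arrays of u64 indexed by PFN (KPMSIZE bytes per entry), so userspace can pread() a single entry at offset pfn * 8. A minimal sketch for /proc/kpageflags:

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t flags = 0;
	unsigned long pfn;
	int fd;

	if (argc != 2)
		return 1;
	pfn = strtoul(argv[1], NULL, 0);

	fd = open("/proc/kpageflags", O_RDONLY);	/* usually root only */
	if (fd < 0)
		return 1;
	if (pread(fd, &flags, sizeof(flags),
		  pfn * sizeof(flags)) != (ssize_t)sizeof(flags))
		return 1;
	printf("pfn %lu flags 0x%" PRIx64 "\n", pfn, flags);
	close(fd);
	return 0;
}
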
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index dba63b2429f0..52f0b75cbce2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
*
* proc net directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -39,22 +36,6 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
-static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 0;
-}
-
-static const struct dentry_operations proc_net_dentry_ops = {
- .d_revalidate = proc_net_d_revalidate,
- .d_delete = always_delete_dentry,
-};
-
-static void pde_force_lookup(struct proc_dir_entry *pde)
-{
- /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
- pde->proc_dops = &proc_net_dentry_ops;
-}
-
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
@@ -77,15 +58,27 @@ static int seq_open_net(struct inode *inode, struct file *file)
}
#ifdef CONFIG_NET_NS
p->net = net;
+ netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
#endif
return 0;
}
+static void seq_file_net_put_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *priv = seq->private;
+
+ put_net_track(priv->net, &priv->ns_tracker);
+#else
+ put_net(&init_net);
+#endif
+}
+
static int seq_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq = f->private_data;
- put_net(seq_file_net(seq));
+ seq_file_net_put_net(seq);
seq_release_private(ino, f);
return 0;
}
@@ -98,12 +91,13 @@ static const struct proc_ops proc_net_seq_ops = {
.proc_release = seq_release_net,
};
-int bpf_iter_init_seq_net(void *priv_data)
+int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
{
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- p->net = get_net(current->nsproxy->net_ns);
+ p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker,
+ GFP_KERNEL);
#endif
return 0;
}
@@ -113,7 +107,7 @@ void bpf_iter_fini_seq_net(void *priv_data)
#ifdef CONFIG_NET_NS
struct seq_net_private *p = priv_data;
- put_net(p->net);
+ put_net_track(p->net, &p->ns_tracker);
#endif
}
@@ -140,8 +134,9 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @mode: The file's access mode.
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
- * @write: The write method which which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @write: The write method with which to 'modify' the file.
+ * @state_size: The size of the per-file private state to allocate.
+ * @data: Data for retrieval by pde_data().
*
* Create a network namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -156,7 +151,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
@@ -232,8 +227,8 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* @mode: The file's access mode.
* @parent: The parent directory in which to create.
* @show: The seqfile show method with which to read the file.
- * @write: The write method which which to 'modify' the file.
- * @data: Data for retrieval by PDE_DATA().
+ * @write: The write method with which to 'modify' the file.
+ * @data: Data for retrieval by pde_data().
*
* Create a network-namespaced proc file in the @parent directory with the
* specified @name and @mode that allows reading of a file that displays a
@@ -248,7 +243,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* modified by the @write function. @write should return 0 on success.
*
* The @data value is accessible from the @show and @write functions by calling
- * PDE_DATA() on the file inode. The network namespace must be accessed by
+ * pde_data() on the file inode. The network namespace must be accessed by
* calling seq_file_single_net() on the seq_file struct.
*/
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
@@ -305,7 +300,8 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
return de;
}
-static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -313,7 +309,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
net = get_proc_task_net(inode);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (net != NULL) {
stat->nlink = net->proc_net->nlink;
@@ -326,6 +322,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
const struct inode_operations proc_net_inode_operations = {
.lookup = proc_tgid_net_lookup,
.getattr = proc_tgid_net_getattr,
+ .setattr = proc_setattr,
};
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
@@ -355,6 +352,12 @@ static __net_init int proc_net_ns_init(struct net *net)
kgid_t gid;
int err;
+ /*
+ * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+ * Corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
err = -ENOMEM;
netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
@@ -378,6 +381,9 @@ static __net_init int proc_net_ns_init(struct net *net)
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
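
Purely as an illustration (not part of the patch): a minimal module-style sketch of the helpers documented above, registering a per-netns file from a pernet_operations init hook and fetching the namespace with seq_file_single_net(); the module, file and function names are made up.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>

static int example_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);

	seq_printf(seq, "inode of this netns: %u\n", net->ns.inum);
	return 0;
}

static int __net_init example_net_init(struct net *net)
{
	if (!proc_create_net_single("example", 0444, net->proc_net,
				    example_show, NULL))
		return -ENOMEM;
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	remove_proc_entry("example", net->proc_net);
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}
module_init(example_init);

static void __exit example_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}
module_exit(example_exit);
MODULE_LICENSE("GPL");
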
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42c5128c7d1c..49ab74e0bfde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -12,42 +12,52 @@
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/mm.h>
+#include <linux/uio.h>
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
+#include <linux/kmemleak.h>
+#include <linux/lockdep.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, header) \
+ entry = header->ctl_table; \
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, INT_MAX };
-EXPORT_SYMBOL(sysctl_vals);
-
-/* Support for permanently empty directories */
-
-struct ctl_table sysctl_mount_point[] = {
+/*
+ * Support for permanently empty directories.
+ * Must be non-empty to avoid sharing an address with other tables.
+ */
+static const struct ctl_table sysctl_mount_point[] = {
{ }
};
-static bool is_empty_dir(struct ctl_table_header *head)
-{
- return head->ctl_table[0].child == sysctl_mount_point;
-}
-
-static void set_empty_dir(struct ctl_dir *dir)
+/**
+ * register_sysctl_mount_point() - registers a sysctl mount point
+ * @path: path for the mount point
+ *
+ * Used to create a permanently empty directory to serve as mount point.
+ * There are some subtle but important permission checks this allows in the
+ * case of unprivileged mounts.
+ */
+struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- dir->header.ctl_table[0].child = sysctl_mount_point;
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
+EXPORT_SYMBOL(register_sysctl_mount_point);
-static void clear_empty_dir(struct ctl_dir *dir)
-
-{
- dir->header.ctl_table[0].child = NULL;
-}
+#define sysctl_is_perm_empty_ctl_header(hptr) \
+ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+#define sysctl_set_perm_empty_ctl_header(hptr) \
+ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+#define sysctl_clear_perm_empty_ctl_header(hptr) \
+ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT)
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
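
Purely as an illustration (not part of the patch): register_sysctl_mount_point() above is just a size-0 register_sysctl_sz() call. A minimal sketch of registering an ordinary one-entry leaf table with the same helper, assuming the constified ctl_table API this series moves to; the path and tunable are made up.

#include <linux/module.h>
#include <linux/sysctl.h>

static int example_value;
static int example_min;
static int example_max = 100;

static const struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_sz("kernel/example", example_table,
					    ARRAY_SIZE(example_table));
	return example_header ? 0 : -ENOMEM;
}
module_init(example_sysctl_init);

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}
module_exit(example_sysctl_exit);
MODULE_LICENSE("GPL");
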
@@ -58,12 +68,11 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
-static struct ctl_table root_table[] = {
+static const struct ctl_table root_table[] = {
{
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
- { }
};
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
@@ -80,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry);
+ const struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
@@ -93,27 +102,23 @@ static void sysctl_print_dir(struct ctl_dir *dir)
static int namecmp(const char *name1, int len1, const char *name2, int len2)
{
- int minlen;
int cmp;
- minlen = len1;
- if (minlen > len2)
- minlen = len2;
-
- cmp = memcmp(name1, name2, minlen);
+ cmp = memcmp(name1, name2, min(len1, len2));
if (cmp == 0)
cmp = len1 - len2;
return cmp;
}
-/* Called under sysctl_lock */
-static struct ctl_table *find_entry(struct ctl_table_header **phead,
+static const struct ctl_table *find_entry(struct ctl_table_header **phead,
struct ctl_dir *dir, const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct rb_node *node = dir->root.rb_node;
+ lockdep_assert_held(&sysctl_lock);
+
while (node)
{
struct ctl_node *ctl_node;
@@ -138,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead,
return NULL;
}
-static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
struct rb_node **p = &head->parent->root.rb_node;
@@ -148,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
while (*p) {
struct ctl_table_header *parent_head;
- struct ctl_table *parent_entry;
+ const struct ctl_table *parent_entry;
struct ctl_node *parent_node;
const char *parent_name;
int cmp;
@@ -167,7 +172,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
else {
pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
- pr_cont("/%s\n", entry->procname);
+ pr_cont("%s\n", entry->procname);
return -EEXIST;
}
}
@@ -177,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
return 0;
}
-static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
@@ -186,9 +191,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
static void init_header(struct ctl_table_header *head,
struct ctl_table_root *root, struct ctl_table_set *set,
- struct ctl_node *node, struct ctl_table *table)
+ struct ctl_node *node, const struct ctl_table *table, size_t table_size)
{
head->ctl_table = table;
+ head->ctl_table_size = table_size;
head->ctl_table_arg = table;
head->used = 0;
head->count = 1;
@@ -200,41 +206,49 @@ static void init_header(struct ctl_table_header *head,
head->node = node;
INIT_HLIST_HEAD(&head->inodes);
if (node) {
- struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+ const struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, head) {
node->header = head;
+ node++;
+ }
}
+ if (table == sysctl_mount_point)
+ sysctl_set_perm_empty_ctl_header(head);
}
static void erase_header(struct ctl_table_header *head)
{
- struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+ const struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, head)
erase_entry(head, entry);
}
static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
+ struct ctl_table_header *dir_h = &dir->header;
int err;
+
/* Is this a permanently empty directory? */
- if (is_empty_dir(&dir->header))
+ if (sysctl_is_perm_empty_ctl_header(dir_h))
return -EROFS;
/* Am I creating a permanently empty directory? */
- if (header->ctl_table == sysctl_mount_point) {
+ if (sysctl_is_perm_empty_ctl_header(header)) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
- set_empty_dir(dir);
+ sysctl_set_perm_empty_ctl_header(dir_h);
}
- dir->header.nreg++;
+ dir_h->nreg++;
header->parent = dir;
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -245,24 +259,26 @@ fail:
put_links(header);
fail_links:
if (header->ctl_table == sysctl_mount_point)
- clear_empty_dir(dir);
+ sysctl_clear_perm_empty_ctl_header(dir_h);
header->parent = NULL;
- drop_sysctl_table(&dir->header);
+ drop_sysctl_table(dir_h);
return err;
}
-/* called under sysctl_lock */
static int use_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (unlikely(p->unregistering))
return 0;
p->used++;
return 1;
}
-/* called under sysctl_lock */
static void unuse_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (!--p->used)
if (unlikely(p->unregistering))
complete(p->unregistering);
@@ -273,9 +289,11 @@ static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
-/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
+ /* will reacquire if has to wait */
+ lockdep_assert_held(&sysctl_lock);
+
/*
* if p->used is 0, nobody will ever touch that entry again;
* we'll eliminate all paths to it before dropping sysctl_lock
@@ -332,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root)
return set;
}
-static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
- struct ctl_dir *dir,
- const char *name, int namelen)
+static const struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
spin_lock(&sysctl_lock);
entry = find_entry(&head, dir, name, namelen);
@@ -362,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node)
}
static void first_entry(struct ctl_dir *dir,
- struct ctl_table_header **phead, struct ctl_table **pentry)
+ struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = NULL;
- struct ctl_table *entry = NULL;
+ const struct ctl_table *entry = NULL;
struct ctl_node *ctl_node;
spin_lock(&sysctl_lock);
@@ -379,10 +397,10 @@ static void first_entry(struct ctl_dir *dir,
*pentry = entry;
}
-static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = *phead;
- struct ctl_table *entry = *pentry;
+ const struct ctl_table *entry = *pentry;
struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
spin_lock(&sysctl_lock);
@@ -415,7 +433,7 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op)
{
struct ctl_table_root *root = head->root;
int mode;
@@ -429,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
}
static struct inode *proc_sys_make_inode(struct super_block *sb,
- struct ctl_table_header *head, struct ctl_table *table)
+ struct ctl_table_header *head, const struct ctl_table *table)
{
struct ctl_table_root *root = head->root;
struct inode *inode;
@@ -455,7 +473,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -465,16 +483,14 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mode |= S_IFDIR;
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
- if (is_empty_dir(head))
+ if (sysctl_is_perm_empty_ctl_header(head))
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
- root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
- else {
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- }
+ root->set_ownership(head, &inode->i_uid, &inode->i_gid);
return inode;
}
@@ -502,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
const struct qstr *name = &dentry->d_name;
- struct ctl_table *p;
+ const struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
struct ctl_dir *ctl_dir;
@@ -525,13 +541,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
}
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (IS_ERR(inode)) {
- err = ERR_CAST(inode);
- goto out;
- }
-
- d_set_d_op(dentry, &proc_sys_dentry_operations);
- err = d_splice_alias(inode, dentry);
+ err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations);
out:
if (h)
@@ -540,13 +550,14 @@ out:
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
- size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+ int write)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- void *kbuf;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ size_t count = iov_iter_count(iter);
+ char *kbuf;
ssize_t error;
if (IS_ERR(head))
@@ -566,63 +577,59 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf,
goto out;
/* don't even try if the size is too large */
- if (count > KMALLOC_MAX_SIZE)
- return -ENOMEM;
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
if (write) {
- kbuf = memdup_user_nul(ubuf, count);
- if (IS_ERR(kbuf)) {
- error = PTR_ERR(kbuf);
- goto out;
- }
- } else {
- error = -ENOMEM;
- kbuf = kzalloc(count, GFP_KERNEL);
- if (!kbuf)
- goto out;
+ error = -EFAULT;
+ if (!copy_from_iter_full(kbuf, count, iter))
+ goto out_free_buf;
+ kbuf[count] = '\0';
}
error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
- ppos);
+ &iocb->ki_pos);
if (error)
goto out_free_buf;
/* careful: calling conventions are nasty here */
- error = table->proc_handler(table, write, kbuf, &count, ppos);
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
if (error)
goto out_free_buf;
if (!write) {
error = -EFAULT;
- if (copy_to_user(ubuf, kbuf, count))
+ if (copy_to_iter(kbuf, count, iter) < count)
goto out_free_buf;
}
error = count;
out_free_buf:
- kfree(kbuf);
+ kvfree(kbuf);
out:
sysctl_head_finish(head);
return error;
}
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+ return proc_sys_call_handler(iocb, iter, 0);
}
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+ return proc_sys_call_handler(iocb, iter, 1);
}
static int proc_sys_open(struct inode *inode, struct file *filp)
{
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
/* sysctl was unregistered */
if (IS_ERR(head))
@@ -640,7 +647,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
__poll_t ret = DEFAULT_POLLMASK;
unsigned long event;
@@ -671,7 +678,7 @@ out:
static bool proc_sys_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
@@ -692,20 +699,15 @@ static bool proc_sys_fill_cache(struct file *file,
if (d_in_lookup(child)) {
struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (IS_ERR(inode)) {
- d_lookup_done(child);
- dput(child);
- return false;
- }
- d_set_d_op(child, &proc_sys_dentry_operations);
- res = d_splice_alias(inode, child);
+ res = d_splice_alias_ops(inode, child,
+ &proc_sys_dentry_operations);
d_lookup_done(child);
if (unlikely(res)) {
- if (IS_ERR(res)) {
- dput(child);
- return false;
- }
dput(child);
+
+ if (IS_ERR(res))
+ return false;
+
child = res;
}
}
@@ -720,7 +722,7 @@ static bool proc_sys_fill_cache(struct file *file,
static bool proc_sys_link_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
bool ret = true;
@@ -738,7 +740,7 @@ out:
return ret;
}
-static int scan(struct ctl_table_header *head, struct ctl_table *table,
+static int scan(struct ctl_table_header *head, const struct ctl_table *table,
unsigned long *pos, struct file *file,
struct dir_context *ctx)
{
@@ -762,7 +764,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
@@ -787,14 +789,15 @@ out:
return 0;
}
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
{
/*
* sysctl entries that are not writeable,
* are _NOT_ writeable, capabilities or not.
*/
struct ctl_table_header *head;
- struct ctl_table *table;
+ const struct ctl_table *table;
int error;
/* Executable files are not allowed under /proc/sys/ */
@@ -815,7 +818,8 @@ static int proc_sys_permission(struct inode *inode, int mask)
return error;
}
-static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+static int proc_sys_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int error;
@@ -823,26 +827,26 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
return -EPERM;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (error)
return error;
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
+ setattr_copy(&nop_mnt_idmap, inode, attr);
return 0;
}
-static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+static int proc_sys_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
@@ -853,8 +857,10 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
static const struct file_operations proc_sys_file_operations = {
.open = proc_sys_open,
.poll = proc_sys_poll,
- .read = proc_sys_read,
- .write = proc_sys_write,
+ .read_iter = proc_sys_read,
+ .write_iter = proc_sys_write,
+ .splice_read = copy_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
@@ -877,7 +883,8 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -910,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
@@ -934,7 +945,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir,
const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
entry = find_entry(&head, dir, name, namelen);
if (!entry)
@@ -953,19 +964,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
- init_header(&new->header, set->dir.header.root, set, node, table);
+ init_header(&new->header, set->dir.header.root, set, node, table, 1);
return new;
}
@@ -1022,8 +1032,8 @@ failed:
if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
- pr_cont("/%*.*s %ld\n",
- namelen, namelen, name, PTR_ERR(subdir));
+ pr_cont("%*.*s %ld\n", namelen, namelen, name,
+ PTR_ERR(subdir));
}
drop_sysctl_table(&dir->header);
if (new)
@@ -1046,16 +1056,15 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry)
+ const struct ctl_table **pentry)
{
struct ctl_table_header *head;
+ const struct ctl_table *entry;
struct ctl_table_root *root;
struct ctl_table_set *set;
- struct ctl_table *entry;
struct ctl_dir *dir;
int ret;
- ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
set = lookup_header_set(root);
@@ -1079,7 +1088,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
return ret;
}
-static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1095,8 +1104,9 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
return -EINVAL;
}
-static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+static int sysctl_check_table_array(const char *path, const struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
@@ -1105,62 +1115,85 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
err |= sysctl_err(path, table, "array not allowed");
}
+ if (table->proc_handler == proc_dou8vec_minmax) {
+ if (table->maxlen != sizeof(u8))
+ err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ }
+
+ if (table->proc_handler == proc_dobool) {
+ if (table->maxlen != sizeof(bool))
+ err |= sysctl_err(path, table, "array not allowed");
+ }
+
return err;
}
-static int sysctl_check_table(const char *path, struct ctl_table *table)
+static int sysctl_check_table(const char *path, struct ctl_table_header *header)
{
+ const struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dobool) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
-static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
- struct ctl_table_root *link_root)
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
{
- struct ctl_table *link_table, *entry, *link;
+ struct ctl_table *link_table, *link;
struct ctl_table_header *links;
+ const struct ctl_table *entry;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
- nr_entries++;
+ list_for_each_table_entry(entry, head) {
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1168,33 +1201,41 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, head) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
- link->data = link_root;
+ link->data = head->root;
link_name += len;
+ link++;
}
- init_header(links, dir->header.root, dir->header.set, node, link_table);
- links->nreg = nr_entries;
+ init_header(links, dir->header.root, dir->header.set, node, link_table,
+ head->ctl_table_size);
+ links->nreg = head->ctl_table_size;
return links;
}
static bool get_links(struct ctl_dir *dir,
- struct ctl_table *table, struct ctl_table_root *link_root)
+ struct ctl_table_header *header,
+ struct ctl_table_root *link_root)
{
- struct ctl_table_header *head;
- struct ctl_table *entry, *link;
+ struct ctl_table_header *tmp_head;
+ const struct ctl_table *entry, *link;
+
+ if (header->ctl_table_size == 0 ||
+ sysctl_is_perm_empty_ctl_header(header))
+ return true;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
if (!link)
return false;
if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
@@ -1205,10 +1246,10 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
- head->nreg++;
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
+ tmp_head->nreg++;
}
return true;
}
@@ -1216,7 +1257,7 @@ static bool get_links(struct ctl_dir *dir,
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
- struct ctl_dir *core_parent = NULL;
+ struct ctl_dir *core_parent;
struct ctl_table_header *links;
int err;
@@ -1227,13 +1268,13 @@ static int insert_links(struct ctl_table_header *head)
if (IS_ERR(core_parent))
return 0;
- if (get_links(core_parent, head->ctl_table, head->root))
+ if (get_links(core_parent, head, head->root))
return 0;
core_parent->header.nreg++;
spin_unlock(&sysctl_lock);
- links = new_links(core_parent, head->ctl_table, head->root);
+ links = new_links(core_parent, head);
spin_lock(&sysctl_lock);
err = -ENOMEM;
@@ -1241,7 +1282,7 @@ static int insert_links(struct ctl_table_header *head)
goto out;
err = 0;
- if (get_links(core_parent, head->ctl_table, head->root)) {
+ if (get_links(core_parent, head, head->root)) {
kfree(links);
goto out;
}
@@ -1254,34 +1295,64 @@ out:
return err;
}
+/* Find the directory for the ctl_table. If one is not found create it. */
+static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
+{
+ const char *name, *nextname;
+
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ /*
+ * namelen ensures that if name is "foo/bar/yay" only "foo" is
+ * registered first. We traverse as if using mkdir -p and
+ * return a ctl_dir for the last directory entry.
+ */
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ break;
+ }
+ return dir;
+}
+
/**
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ *
+ * @table: the top-level table structure. This table must not be freed
+ * while it is registered, so it must not live on the stack. It can either
+ * be a global or be dynamically allocated by the caller and freed only
+ * after sysctl unregistration.
+ * @table_size: The number of elements in @table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
@@ -1298,53 +1369,32 @@ out:
*/
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
- const char *path, struct ctl_table *table)
+ const char *path, const struct ctl_table *table, size_t table_size)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
- const char *name, *nextname;
struct ctl_dir *dir;
- struct ctl_table *entry;
struct ctl_node *node;
- int nr_entries = 0;
-
- for (entry = table; entry->procname; entry++)
- nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
node = (struct ctl_node *)(header + 1);
- init_header(header, root, set, node, table);
- if (sysctl_check_table(path, table))
+ init_header(header, root, set, node, table, table_size);
+ if (sysctl_check_table(path, header))
goto fail;
spin_lock(&sysctl_lock);
dir = &set->dir;
- /* Reference moved down the diretory tree get_subdir */
+ /* Reference moved down the directory tree get_subdir */
dir->header.nreg++;
spin_unlock(&sysctl_lock);
- /* Find the directory for the ctl_table */
- for (name = path; name; name = nextname) {
- int namelen;
- nextname = strchr(name, '/');
- if (nextname) {
- namelen = nextname - name;
- nextname++;
- } else {
- namelen = strlen(name);
- }
- if (namelen == 0)
- continue;
-
- dir = get_subdir(dir, name, namelen);
- if (IS_ERR(dir))
- goto fail;
- }
-
+ dir = sysctl_mkdir_p(dir, path);
+ if (IS_ERR(dir))
+ goto fail;
spin_lock(&sysctl_lock);
if (insert_header(dir, header))
goto fail_put_dir_locked;
@@ -1359,247 +1409,75 @@ fail_put_dir_locked:
spin_unlock(&sysctl_lock);
fail:
kfree(header);
- dump_stack();
return NULL;
}
/**
- * register_sysctl - register a sysctl table
- * @path: The path to the directory the sysctl table is in.
- * @table: the table structure
+ * register_sysctl_sz - register a sysctl table
+ * @path: The path to the directory the sysctl table is in. If the path
+ * doesn't exist we will create it for you.
+ * @table: the table structure. The caller must ensure @table stays alive
+ * for the whole lifetime of the sysctl. It must not be freed until
+ * unregister_sysctl_table() is called with the header returned by this
+ * registration. If your code is non-modular then you don't need to call
+ * unregister_sysctl_table() and can instead use something like
+ * register_sysctl_init(), which does not care about the result of the
+ * sysctl registration.
+ * @table_size: The number of elements in table.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_table for more details.
*/
-struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
+ size_t table_size)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
- path, table);
-}
-EXPORT_SYMBOL(register_sysctl);
-
-static char *append_path(const char *path, char *pos, const char *name)
-{
- int namelen;
- namelen = strlen(name);
- if (((pos - path) + namelen + 2) >= PATH_MAX)
- return NULL;
- memcpy(pos, name, namelen);
- pos[namelen] = '/';
- pos[namelen + 1] = '\0';
- pos += namelen + 1;
- return pos;
-}
-
-static int count_subheaders(struct ctl_table *table)
-{
- int has_files = 0;
- int nr_subheaders = 0;
- struct ctl_table *entry;
-
- /* special case: no directory and empty directory */
- if (!table || !table->procname)
- return 1;
-
- for (entry = table; entry->procname; entry++) {
- if (entry->child)
- nr_subheaders += count_subheaders(entry->child);
- else
- has_files = 1;
- }
- return nr_subheaders + has_files;
-}
-
-static int register_leaf_sysctl_tables(const char *path, char *pos,
- struct ctl_table_header ***subheader, struct ctl_table_set *set,
- struct ctl_table *table)
-{
- struct ctl_table *ctl_table_arg = NULL;
- struct ctl_table *entry, *files;
- int nr_files = 0;
- int nr_dirs = 0;
- int err = -ENOMEM;
-
- for (entry = table; entry->procname; entry++) {
- if (entry->child)
- nr_dirs++;
- else
- nr_files++;
- }
-
- files = table;
- /* If there are mixed files and directories we need a new table */
- if (nr_dirs && nr_files) {
- struct ctl_table *new;
- files = kcalloc(nr_files + 1, sizeof(struct ctl_table),
- GFP_KERNEL);
- if (!files)
- goto out;
-
- ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
- if (entry->child)
- continue;
- *new = *entry;
- new++;
- }
- }
-
- /* Register everything except a directory full of subdirectories */
- if (nr_files || !nr_dirs) {
- struct ctl_table_header *header;
- header = __register_sysctl_table(set, path, files);
- if (!header) {
- kfree(ctl_table_arg);
- goto out;
- }
-
- /* Remember if we need to free the file table */
- header->ctl_table_arg = ctl_table_arg;
- **subheader = header;
- (*subheader)++;
- }
-
- /* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
- char *child_pos;
-
- if (!entry->child)
- continue;
-
- err = -ENAMETOOLONG;
- child_pos = append_path(path, pos, entry->procname);
- if (!child_pos)
- goto out;
-
- err = register_leaf_sysctl_tables(path, child_pos, subheader,
- set, entry->child);
- pos[0] = '\0';
- if (err)
- goto out;
- }
- err = 0;
-out:
- /* On failure our caller will unregister all registered subheaders */
- return err;
+ path, table, table_size);
}
+EXPORT_SYMBOL(register_sysctl_sz);
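Editorial aside (not part of the diff): with the sentinel-free API above, the caller passes the element count explicitly and must keep the table alive until it is unregistered. A minimal sketch with made-up names ("fs/example", example_value); register_sysctl_sz() returns NULL on failure, as shown in the code above:

    #include <linux/module.h>
    #include <linux/sysctl.h>

    static int example_value;

    static const struct ctl_table example_table[] = {
            {
                    .procname       = "example_value",
                    .data           = &example_value,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
            /* Creates /proc/sys/fs/example/example_value; path and names are made up. */
            example_header = register_sysctl_sz("fs/example", example_table,
                                                ARRAY_SIZE(example_table));
            return example_header ? 0 : -ENOMEM;
    }
    module_init(example_sysctl_init);

    static void __exit example_sysctl_exit(void)
    {
            /* example_table stays alive until here, as the kernel-doc requires. */
            unregister_sysctl_table(example_header);
    }
    module_exit(example_sysctl_exit);

    MODULE_LICENSE("GPL");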
/**
- * __register_sysctl_paths - register a sysctl table hierarchy
- * @set: Sysctl tree to register on
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ * The caller must ensure @table stays alive for the whole lifetime of the
+ * sysctl.
+ * @table_name: The name of the sysctl table, only used for log printing when
+ * registration fails
+ * @table_size: The number of elements in @table
*
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * The sysctl interface is used by userspace to query or modify a predefined
+ * value of a variable at runtime. These variables, however, have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails: failure only means you lose the ability to
+ * query or modify the sysctls dynamically at run time. The chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
*
- * See __register_sysctl_table for more details.
+ * Context: if your base directory does not exist it will be created for you.
*/
-struct ctl_table_header *__register_sysctl_paths(
- struct ctl_table_set *set,
- const struct ctl_path *path, struct ctl_table *table)
+void __init __register_sysctl_init(const char *path, const struct ctl_table *table,
+ const char *table_name, size_t table_size)
{
- struct ctl_table *ctl_table_arg = table;
- int nr_subheaders = count_subheaders(table);
- struct ctl_table_header *header = NULL, **subheaders, **subheader;
- const struct ctl_path *component;
- char *new_path, *pos;
-
- pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!new_path)
- return NULL;
-
- pos[0] = '\0';
- for (component = path; component->procname; component++) {
- pos = append_path(new_path, pos, component->procname);
- if (!pos)
- goto out;
- }
- while (table->procname && table->child && !table[1].procname) {
- pos = append_path(new_path, pos, table->procname);
- if (!pos)
- goto out;
- table = table->child;
- }
- if (nr_subheaders == 1) {
- header = __register_sysctl_table(set, new_path, table);
- if (header)
- header->ctl_table_arg = ctl_table_arg;
- } else {
- header = kzalloc(sizeof(*header) +
- sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
- if (!header)
- goto out;
+ struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
- subheaders = (struct ctl_table_header **) (header + 1);
- subheader = subheaders;
- header->ctl_table_arg = ctl_table_arg;
-
- if (register_leaf_sysctl_tables(new_path, pos, &subheader,
- set, table))
- goto err_register_leaves;
- }
-
-out:
- kfree(new_path);
- return header;
-
-err_register_leaves:
- while (subheader > subheaders) {
- struct ctl_table_header *subh = *(--subheader);
- struct ctl_table *table = subh->ctl_table_arg;
- unregister_sysctl_table(subh);
- kfree(table);
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path);
+ return;
}
- kfree(header);
- header = NULL;
- goto out;
+ kmemleak_not_leak(hdr);
}
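Editorial aside (not part of the diff): boot-time code that never unregisters its tables is expected to go through the register_sysctl_init() wrapper, which supplies the table name and ARRAY_SIZE(table) to the function above; failures are only logged. A hypothetical sketch, with "exampled_enabled" being a made-up knob registered under the existing "kernel" directory:

    #include <linux/init.h>
    #include <linux/sysctl.h>

    static int exampled_enabled;

    static const struct ctl_table exampled_table[] = {
            {
                    .procname       = "exampled_enabled",
                    .data           = &exampled_enabled,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec_minmax,
                    .extra1         = SYSCTL_ZERO,
                    .extra2         = SYSCTL_ONE,
            },
    };

    static int __init exampled_sysctl_init(void)
    {
            /* No header kept: boot-time registrations are never torn down. */
            register_sysctl_init("kernel", exampled_table);
            return 0;
    }
    subsys_initcall(exampled_sysctl_init);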
-/**
- * register_sysctl_table_path - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return __register_sysctl_paths(&sysctl_table_root.default_set,
- path, table);
-}
-EXPORT_SYMBOL(register_sysctl_paths);
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
- static const struct ctl_path null_path[] = { {} };
-
- return register_sysctl_paths(null_path, table);
-}
-EXPORT_SYMBOL(register_sysctl_table);
-
static void put_links(struct ctl_table_header *header)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
struct ctl_table_root *root = header->root;
struct ctl_dir *parent = header->parent;
struct ctl_dir *core_parent;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
if (header->set == root_set)
return;
@@ -1608,9 +1486,9 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header) {
struct ctl_table_header *link_head;
- struct ctl_table *link;
+ const struct ctl_table *link;
const char *name = entry->procname;
link = find_entry(&link_head, core_parent, name, strlen(name));
@@ -1622,7 +1500,7 @@ static void put_links(struct ctl_table_header *header)
else {
pr_err("sysctl link missing during unregister: ");
sysctl_print_dir(parent);
- pr_cont("/%s\n", name);
+ pr_cont("%s\n", name);
}
}
}
@@ -1648,35 +1526,18 @@ static void drop_sysctl_table(struct ctl_table_header *header)
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
- * @header: the header returned from register_sysctl_table
+ * @header: the header returned from register_sysctl or __register_sysctl_table
*
* Unregisters the sysctl table and all children. proc entries may not
* actually be removed until they are no longer used by anyone.
*/
void unregister_sysctl_table(struct ctl_table_header * header)
{
- int nr_subheaders;
might_sleep();
if (header == NULL)
return;
- nr_subheaders = count_subheaders(header->ctl_table_arg);
- if (unlikely(nr_subheaders > 1)) {
- struct ctl_table_header **subheaders;
- int i;
-
- subheaders = (struct ctl_table_header **)(header + 1);
- for (i = nr_subheaders -1; i >= 0; i--) {
- struct ctl_table_header *subh = subheaders[i];
- struct ctl_table *table = subh->ctl_table_arg;
- unregister_sysctl_table(subh);
- kfree(table);
- }
- kfree(header);
- return;
- }
-
spin_lock(&sysctl_lock);
drop_sysctl_table(header);
spin_unlock(&sysctl_lock);
@@ -1689,7 +1550,7 @@ void setup_sysctl_set(struct ctl_table_set *set,
{
memset(set, 0, sizeof(*set));
set->is_seen = is_seen;
- init_header(&set->dir.header, root, set, NULL, root_table);
+ init_header(&set->dir.header, root, set, NULL, root_table, 1);
}
void retire_sysctl_set(struct ctl_table_set *set)
@@ -1706,7 +1567,7 @@ int __init proc_sys_init(void)
proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return sysctl_init();
+ return sysctl_init_bases();
}
struct sysctl_alias {
@@ -1728,7 +1589,6 @@ static const struct sysctl_alias sysctl_aliases[] = {
{"hung_task_panic", "kernel.hung_task_panic" },
{"numa_zonelist_order", "vm.numa_zonelist_order" },
{"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
- {"softlockup_panic", "kernel.softlockup_panic" },
{ }
};
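Editorial usage note (not part of the diff): combined with the handling in process_sysctl_arg() below, the alias table above means either spelling of the boot parameter ends up being written to /proc/sys/kernel/hung_task_panic through a temporary proc mount:

    hung_task_panic=1
    sysctl.kernel.hung_task_panic=1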
@@ -1744,6 +1604,13 @@ static const char *sysctl_find_alias(char *param)
return NULL;
}
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
/* Set sysctl value passed on kernel command line. */
static int process_sysctl_arg(char *param, char *val,
const char *unused, void *arg)
@@ -1770,6 +1637,12 @@ static int process_sysctl_arg(char *param, char *val,
return 0;
}
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
/*
* To set sysctl options, we use a temporary mount of proc, look up the
* respective sys/ file and write to it. To avoid mounting it when no
@@ -1797,7 +1670,7 @@ static int process_sysctl_arg(char *param, char *val,
panic("%s: Failed to allocate path for %s\n", __func__, param);
strreplace(path, '.', '/');
- file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
+ file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
if (IS_ERR(file)) {
err = PTR_ERR(file);
if (err == -ENOENT)
@@ -1811,7 +1684,6 @@ static int process_sysctl_arg(char *param, char *val,
file, param, val);
goto out;
}
- len = strlen(val);
wret = kernel_write(file, val, len, &pos);
if (wret < 0) {
err = wret;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
*
* Copyright 1997, Theodore Ts'o
*/
-
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5e444d4f9717..d8ca41d823e4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
*
* proc root directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -41,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -112,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get an fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
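Editorial aside (not part of the diff): from userspace, the new "pidns" option is set through the new-mount-API syscalls before the superblock is created; mount_proc_for_pidns() below is a made-up helper, and the permission checks in proc_parse_pidns_param() above still apply. A rough sketch using raw syscall numbers, since libc wrappers may be missing:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/mount.h>

    static int mount_proc_for_pidns(const char *pidns_path, const char *target)
    {
            int fsfd, mfd, ret;

            fsfd = syscall(SYS_fsopen, "proc", 0);
            if (fsfd < 0)
                    return -1;

            /* "pidns" also accepts an already-open nsfs fd via FSCONFIG_SET_FD. */
            if (syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "pidns", pidns_path, 0) ||
                syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
                    close(fsfd);
                    return -1;
            }

            mfd = syscall(SYS_fsmount, fsfd, 0, 0);
            close(fsfd);
            if (mfd < 0)
                    return -1;

            ret = syscall(SYS_move_mount, mfd, "", AT_FDCWD, target,
                          MOVE_MOUNT_F_EMPTY_PATH);
            close(mfd);
            return ret;
    }

For instance, mount_proc_for_pidns("/proc/1234/ns/pid", "/mnt/proc-child") would give a procfs view pinned to that pid namespace, provided the caller has CAP_SYS_ADMIN over the namespace's owning user namespace.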
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -128,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -157,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
@@ -191,7 +274,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
/* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
+ s->s_shrink->seeks = 0;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
@@ -264,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb)
{
struct proc_fs_info *fs_info = proc_sb_info(sb);
- if (!fs_info) {
- kill_anon_super(sb);
- return;
- }
-
- dput(fs_info->proc_self);
- dput(fs_info->proc_thread_self);
-
kill_anon_super(sb);
- put_pid_ns(fs_info->pid_ns);
- kfree(fs_info);
+ if (fs_info) {
+ put_pid_ns(fs_info->pid_ns);
+ kfree_rcu(fs_info, rcu);
+ }
}
static struct file_system_type proc_fs_type = {
@@ -305,13 +382,20 @@ void __init proc_root_init(void)
proc_mkdir("bus", NULL);
proc_sys_init();
+ /*
+ * Last things last. It is not as if userspace processes eager
+ * to open /proc files exist at this point, but register last
+ * anyway.
+ */
register_filesystem(&proc_fs_type);
}
-static int proc_root_getattr(const struct path *path, struct kstat *stat,
+static int proc_root_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
- generic_fillattr(d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
@@ -359,12 +443,12 @@ static const struct inode_operations proc_root_inode_operations = {
* This is the root "inode" in the /proc tree..
*/
struct proc_dir_entry proc_root = {
- .low_ino = PROC_ROOT_INO,
- .namelen = 5,
- .mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .nlink = 2,
+ .low_ino = PROCFS_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
.refcnt = REFCOUNT_INIT(1),
- .proc_iops = &proc_root_inode_operations,
+ .proc_iops = &proc_root_inode_operations,
.proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..62d2c0cfe35c 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = {
.get_link = proc_self_get_link,
};
-static unsigned self_inum __ro_after_init;
+unsigned self_inum __ro_after_init;
int proc_setup_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *self;
int ret = -ENOMEM;
@@ -46,23 +45,20 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
- d_add(self, inode);
+ d_make_persistent(self, inode);
ret = 0;
- } else {
- dput(self);
}
+ dput(self);
}
inode_unlock(root_inode);
if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- else
- fs_info->proc_self = self;
return ret;
}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 12901dcf57e2..04bb29721419 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -3,6 +3,7 @@
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
/*
* /proc/softirqs ... display the number of softirqs
@@ -19,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v)
for (i = 0; i < NR_SOFTIRQS; i++) {
seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
- seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+ seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10);
seq_putc(p, '\n');
}
return 0;
@@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v)
static int __init proc_softirqs_init(void)
{
- proc_create_single("softirqs", 0, NULL, show_softirqs);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..8b444e862319 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/irqnr.h>
#include <linux/sched/cputime.h>
#include <linux/tick.h>
@@ -21,31 +22,7 @@
#define arch_irq_stat() 0
#endif
-#ifdef arch_idle_time
-
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle;
-
- idle = kcs->cpustat[CPUTIME_IDLE];
- if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
- idle += arch_idle_time(cpu);
- return idle;
-}
-
-static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 iowait;
-
- iowait = kcs->cpustat[CPUTIME_IOWAIT];
- if (cpu_online(cpu) && nr_iowait_cpu(cpu))
- iowait += arch_idle_time(cpu);
- return iowait;
-}
-
-#else
-
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -77,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
return iowait;
}
-#endif
-
static void show_irq_gap(struct seq_file *p, unsigned int gap)
{
static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
@@ -101,7 +76,7 @@ static void show_all_irqs(struct seq_file *p)
seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
next = i + 1;
}
- show_irq_gap(p, nr_irqs - next);
+ show_irq_gap(p, irq_get_nr_irqs() - next);
}
static int show_stat(struct seq_file *p, void *v)
@@ -118,6 +93,8 @@ static int show_stat(struct seq_file *p, void *v)
irq = softirq = steal = 0;
guest = guest_nice = 0;
getboottime64(&boottime);
+ /* shift boot timestamp according to the timens offset */
+ timens_sub_boottime(&boottime);
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
@@ -197,8 +174,8 @@ static int show_stat(struct seq_file *p, void *v)
"\nctxt %llu\n"
"btime %llu\n"
"processes %lu\n"
- "procs_running %lu\n"
- "procs_blocked %lu\n",
+ "procs_running %u\n"
+ "procs_blocked %u\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
@@ -219,14 +196,14 @@ static int stat_open(struct inode *inode, struct file *file)
unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
- size += 2 * nr_irqs;
+ size += 2 * irq_get_nr_irqs();
return single_open_size(file, show_stat, NULL, size);
}
static const struct proc_ops stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dbda4499a859..81dfc26bfae8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,9 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
+#include <linux/ksm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
@@ -13,18 +14,24 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
+#include <linux/buildid.h>
#include <asm/elf.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
@@ -32,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -55,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -88,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -123,72 +130,211 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
+{
+ lock_ctx->locked_vma = NULL;
+ lock_ctx->mmap_locked = false;
+}
+
+static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->locked_vma) {
+ vma_end_read(lock_ctx->locked_vma);
+ lock_ctx->locked_vma = NULL;
+ }
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
+{
+ /*
+ * smaps and numa_maps perform a page table walk and therefore require
+ * mmap_lock, but maps can be read by locking just the vma and
+ * walking the vma tree under rcu read protection.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(lock_ctx->mm))
+ return false;
+
+ lock_ctx->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ reset_lock_ctx(lock_ctx);
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_unlock();
+ }
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+ struct vm_area_struct *vma;
+
+ if (lock_ctx->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_ctx_vma(lock_ctx);
+ vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ lock_ctx->locked_vma = vma;
+
+ return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+ if (lock_ctx->mmap_locked)
+ return false;
+
+ rcu_read_unlock();
+ mmap_read_lock(lock_ctx->mm);
+ /* Reinitialize the iterator after taking mmap_lock */
+ vma_iter_set(&priv->iter, pos);
+ lock_ctx->mmap_locked = true;
+
+ return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+
+retry:
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR or EAGAIN is possible */
+ if (IS_ERR(vma)) {
+ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+ goto retry;
+
+ return vma;
+ }
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
+ if (vma) {
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
+ } else {
+ *ppos = SENTINEL_VMA_GATE;
+ vma = get_gate_vma(priv->lock_ctx.mm);
+ }
+
+ return vma;
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = *ppos;
+ struct proc_maps_locking_ctx *lock_ctx;
+ loff_t last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
/* See m_next(). Zero at the start or after lseek. */
- if (last_addr == -1UL)
+ if (last_addr == SENTINEL_VMA_END)
return NULL;
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
+ lock_ctx = &priv->lock_ctx;
+ mm = lock_ctx->mm;
if (!mm || !mmget_not_zero(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
+ if (!lock_vma_range(m, lock_ctx)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
return ERR_PTR(-EINTR);
}
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
+ vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);
- priv->tail_vma = get_gate_vma(mm);
+ if (last_addr == SENTINEL_VMA_GATE)
+ return get_gate_vma(mm);
- vma = find_vma(mm, last_addr);
- if (vma)
- return vma;
-
- return priv->tail_vma;
+ return proc_get_vma(m, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next, *vma = v;
-
- if (vma == priv->tail_vma)
- next = NULL;
- else if (vma->vm_next)
- next = vma->vm_next;
- else
- next = priv->tail_vma;
-
- *ppos = next ? next->vm_start : -1UL;
-
- return next;
+ if (*ppos == SENTINEL_VMA_GATE) {
+ *ppos = SENTINEL_VMA_END;
+ return NULL;
+ }
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mm_struct *mm = priv->mm;
+ struct mm_struct *mm = priv->lock_ctx.mm;
if (!priv->task)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_vma_range(&priv->lock_ctx);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -203,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->lock_ctx.mm)) {
+ int err = PTR_ERR(priv->lock_ctx.mm);
seq_release_private(inode, file);
return err;
@@ -219,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
@@ -232,19 +378,65 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct vm_area_struct *vma)
+static void get_vma_name(struct vm_area_struct *vma,
+ const struct path **path,
+ const char **name,
+ const char **name_fmt)
{
+ struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
+
+ *name = NULL;
+ *path = NULL;
+ *name_fmt = NULL;
+
/*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
*/
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
+ if (vma->vm_file) {
+ /*
+ * If the user named this anon shared memory via
+ * prctl(PR_SET_VMA, ...), use the provided name.
+ */
+ if (anon_name) {
+ *name_fmt = "[anon_shmem:%s]";
+ *name = anon_name->name;
+ } else {
+ *path = file_user_path(vma->vm_file);
+ }
+ return;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ *name = vma->vm_ops->name(vma);
+ if (*name)
+ return;
+ }
+
+ *name = arch_vma_name(vma);
+ if (*name)
+ return;
+
+ if (!vma->vm_mm) {
+ *name = "[vdso]";
+ return;
+ }
+
+ if (vma_is_initial_heap(vma)) {
+ *name = "[heap]";
+ return;
+ }
+
+ if (vma_is_initial_stack(vma)) {
+ *name = "[stack]";
+ return;
+ }
+
+ if (anon_name) {
+ *name_fmt = "[anon:%s]";
+ *name = anon_name->name;
+ return;
+ }
}
static void show_vma_header_prefix(struct seq_file *m,
@@ -270,17 +462,17 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
- struct file *file = vma->vm_file;
+ const struct path *path;
+ const char *name_fmt, *name;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- const char *name = NULL;
- if (file) {
- struct inode *inode = file_inode(vma->vm_file);
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -290,41 +482,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- /*
- * Print the dentry name for named mappings, and a
- * special [heap] marker for the heap:
- */
- if (file) {
+ get_vma_name(vma, &path, &name, &name_fmt);
+ if (path) {
seq_pad(m, ' ');
- seq_file_path(m, file, "\n");
- goto done;
- }
-
- if (vma->vm_ops && vma->vm_ops->name) {
- name = vma->vm_ops->name(vma);
- if (name)
- goto done;
- }
-
- name = arch_vma_name(vma);
- if (!name) {
- if (!mm) {
- name = "[vdso]";
- goto done;
- }
-
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
- name = "[heap]";
- goto done;
- }
-
- if (is_stack(vma))
- name = "[stack]";
- }
-
-done:
- if (name) {
+ seq_path(m, path, "\n");
+ } else if (name_fmt) {
+ seq_pad(m, ' ');
+ seq_printf(m, name_fmt, name);
+ } else if (name) {
seq_pad(m, ' ');
seq_puts(m, name);
}
@@ -349,11 +514,315 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_maps_op);
}
+#define PROCMAP_QUERY_VMA_FLAGS ( \
+ PROCMAP_QUERY_VMA_READABLE | \
+ PROCMAP_QUERY_VMA_WRITABLE | \
+ PROCMAP_QUERY_VMA_EXECUTABLE | \
+ PROCMAP_QUERY_VMA_SHARED \
+)
+
+#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
+ PROCMAP_QUERY_FILE_BACKED_VMA | \
+ PROCMAP_QUERY_VMA_FLAGS \
+)
+
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ reset_lock_ctx(lock_ctx);
+
+ return 0;
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ }
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ struct mm_struct *mm = lock_ctx->mm;
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+
+ if (lock_ctx->mmap_locked)
+ return find_vma(mm, addr);
+
+ /* Unlock previously locked VMA and find the next one under RCU */
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_lock();
+ vma_iter_init(&vmi, mm, addr);
+ vma = lock_next_vma(mm, &vmi, addr);
+ rcu_read_unlock();
+
+ if (!vma)
+ return NULL;
+
+ if (!IS_ERR(vma)) {
+ lock_ctx->locked_vma = vma;
+ return vma;
+ }
+
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* Fallback to mmap_lock on vma->vm_refcnt overflow */
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ lock_ctx->mmap_locked = true;
+ }
+
+ return vma;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ return find_vma(lock_ctx->mm, addr);
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr, u32 flags)
+{
+ struct vm_area_struct *vma;
+
+next_vma:
+ vma = query_vma_find_by_addr(lock_ctx, addr);
+ if (IS_ERR(vma))
+ return vma;
+
+ if (!vma)
+ goto no_vma;
+
+ /* user requested only file-backed VMA, keep iterating */
+ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
+ goto skip_vma;
+
+ /* VMA permissions should satisfy query flags */
+ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
+ u32 perm = 0;
+
+ if (flags & PROCMAP_QUERY_VMA_READABLE)
+ perm |= VM_READ;
+ if (flags & PROCMAP_QUERY_VMA_WRITABLE)
+ perm |= VM_WRITE;
+ if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
+ perm |= VM_EXEC;
+ if (flags & PROCMAP_QUERY_VMA_SHARED)
+ perm |= VM_MAYSHARE;
+
+ if ((vma->vm_flags & perm) != perm)
+ goto skip_vma;
+ }
+
+ /* found covering VMA or user is OK with the matching next VMA */
+ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
+ return vma;
+
+skip_vma:
+ /*
+ * If the user needs closest matching VMA, keep iterating.
+ */
+ addr = vma->vm_end;
+ if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
+ goto next_vma;
+
+no_vma:
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
+{
+ struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
+ struct procmap_query karg;
+ struct vm_area_struct *vma;
+ const char *name = NULL;
+ char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
+ __u64 usize;
+ int err;
+
+ if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
+ return -EFAULT;
+ /* argument struct can never be that large, reject abuse */
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+ /* argument struct should have at least query_flags and query_addr fields */
+ if (usize < offsetofend(struct procmap_query, query_addr))
+ return -EINVAL;
+ err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ if (err)
+ return err;
+
+ /* reject unknown flags */
+ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
+ return -EINVAL;
+ /* either both buffer address and size are set, or both should be zero */
+ if (!!karg.vma_name_size != !!karg.vma_name_addr)
+ return -EINVAL;
+ if (!!karg.build_id_size != !!karg.build_id_addr)
+ return -EINVAL;
+
+ if (!mm || !mmget_not_zero(mm))
+ return -ESRCH;
+
+ err = query_vma_setup(&lock_ctx);
+ if (err) {
+ mmput(mm);
+ return err;
+ }
+
+ vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ vma = NULL;
+ goto out;
+ }
+
+ karg.vma_start = vma->vm_start;
+ karg.vma_end = vma->vm_end;
+
+ karg.vma_flags = 0;
+ if (vma->vm_flags & VM_READ)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
+ if (vma->vm_flags & VM_WRITE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
+ if (vma->vm_flags & VM_EXEC)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
+ if (vma->vm_flags & VM_MAYSHARE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
+
+ karg.vma_page_size = vma_kernel_pagesize(vma);
+
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
+ karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
+ karg.dev_major = MAJOR(inode->i_sb->s_dev);
+ karg.dev_minor = MINOR(inode->i_sb->s_dev);
+ karg.inode = inode->i_ino;
+ } else {
+ karg.vma_offset = 0;
+ karg.dev_major = 0;
+ karg.dev_minor = 0;
+ karg.inode = 0;
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.vma_name_size) {
+ size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
+ const struct path *path;
+ const char *name_fmt;
+ size_t name_sz = 0;
+
+ get_vma_name(vma, &path, &name, &name_fmt);
+
+ if (path || name_fmt || name) {
+ name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
+ if (!name_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (path) {
+ name = d_path(path, name_buf, name_buf_sz);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ goto out;
+ }
+ name_sz = name_buf + name_buf_sz - name;
+ } else if (name || name_fmt) {
+ name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
+ name = name_buf;
+ }
+ if (name_sz > name_buf_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.vma_name_size = name_sz;
+ }
+
+ /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+
+ if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
+ name, karg.vma_name_size)) {
+ kfree(name_buf);
+ return -EFAULT;
+ }
+ kfree(name_buf);
+
+ if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
+ build_id_buf, karg.build_id_size))
+ return -EFAULT;
+
+ if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
+ return -EFAULT;
+
+ return 0;
+
+out:
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+ kfree(name_buf);
+ return err;
+}
+
+static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ switch (cmd) {
+ case PROCMAP_QUERY:
+ /* priv->lock_ctx.mm is set during file open operation */
+ return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = proc_map_release,
+ .unlocked_ioctl = procfs_procmap_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
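Editorial aside (not part of the diff): the ioctl added above lets tools look up a single VMA without text-parsing /proc/<pid>/maps. A hypothetical userspace sketch; the PROCMAP_QUERY_* flags and the procmap_query fields used here all appear in the code above, while the name of the leading size member is assumed from the <linux/fs.h> uapi header:

    #include <fcntl.h>
    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>   /* struct procmap_query, PROCMAP_QUERY */

    static int query_vma(const char *maps_path, unsigned long addr)
    {
            char name[PATH_MAX];
            struct procmap_query q;
            int fd, err;

            fd = open(maps_path, O_RDONLY);
            if (fd < 0)
                    return -1;

            memset(&q, 0, sizeof(q));
            q.size = sizeof(q);             /* lets old/new binaries and kernels interoperate */
            q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
            q.query_addr = addr;
            q.vma_name_addr = (uint64_t)(uintptr_t)name;    /* optional: fetch the mapping name too */
            q.vma_name_size = sizeof(name);

            err = ioctl(fd, PROCMAP_QUERY, &q);
            close(fd);
            if (err)
                    return -1;

            printf("%llx-%llx %s\n",
                   (unsigned long long)q.vma_start,
                   (unsigned long long)q.vma_end,
                   q.vma_name_size ? name : "<unnamed>");
            return 0;
    }

For instance, query_vma("/proc/self/maps", (unsigned long)&query_vma) reports the text mapping of the calling binary itself.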
/*
@@ -391,24 +860,25 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long ksm;
u64 pss;
u64 pss_anon;
u64 pss_file;
u64 pss_shmem;
+ u64 pss_dirty;
u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
- struct page *page, unsigned long size, unsigned long pss,
+ struct folio *folio, unsigned long size, unsigned long pss,
bool dirty, bool locked, bool private)
{
mss->pss += pss;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
mss->pss_anon += pss;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->pss_shmem += pss;
else
mss->pss_file += pss;
@@ -416,7 +886,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
if (locked)
mss->pss_locked += pss;
- if (dirty || PageDirty(page)) {
+ if (dirty || folio_test_dirty(folio)) {
+ mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
else
@@ -430,46 +901,77 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool present)
{
+ struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
+ bool exclusive;
+ int mapcount;
/*
* First accumulate quantities that depend only on |size| and the type
* of the compound page.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
mss->anonymous += size;
- if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ if (!folio_test_swapbacked(folio) && !dirty &&
+ !folio_test_dirty(folio))
mss->lazyfree += size;
}
+ if (folio_test_ksm(folio))
+ mss->ksm += size;
+
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || page_is_young(page) || PageReferenced(page))
+ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/*
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * page_count(page) == 1 guarantees the page is mapped exactly once.
- * If any subpage of the compound page mapped with PTE it would elevate
- * page_count().
+ * refcount == 1 for present entries guarantees that the folio is mapped
+ * exactly once. For large folios this implies that exactly one
+ * PTE/PMD/... maps (a part of) this folio.
+ *
+ * Treat all non-present entries (where relying on the mapcount and
+ * refcount doesn't make sense) as "maybe shared, but not sure how
+ * often". We treat device private entries as being fake-present.
+ *
+ * Note that it would not be safe to read the mapcount especially for
+ * pages referenced by migration entries, even with the PTL held.
*/
- if (page_count(page) == 1) {
- smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
- locked, true);
+ if (folio_ref_count(folio) == 1 || !present) {
+ smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
+ dirty, locked, present);
return;
}
+
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ mapcount = folio_average_page_mapcount(folio);
+ exclusive = !folio_maybe_mapped_shared(folio);
+ }
+
+ /*
+ * We obtain a snapshot of the mapcount. Without holding the folio lock
+ * this snapshot can be slightly wrong as we cannot always read the
+ * mapcount atomically.
+ */
for (i = 0; i < nr; i++, page++) {
- int mapcount = page_mapcount(page);
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+ mapcount = folio_precise_page_mapcount(folio, page);
+ exclusive = mapcount < 2;
+ }
+
if (mapcount >= 2)
pss /= mapcount;
- smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
- mapcount < 2);
+ smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
+ dirty, locked, exclusive);
}
}
@@ -478,9 +980,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
@@ -488,6 +992,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
#define smaps_pte_hole NULL
#endif /* CONFIG_SHMEM */
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -495,17 +1009,24 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool present = false, young = false, dirty = false;
+ pte_t ptent = ptep_get(pte);
- if (pte_present(*pte)) {
- page = vm_normal_page(vma, addr, *pte);
- } else if (is_swap_pte(*pte)) {
- swp_entry_t swpent = pte_to_swp_entry(*pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(vma, addr, ptent);
+ young = pte_young(ptent);
+ dirty = pte_dirty(ptent);
+ present = true;
+ } else if (pte_none(ptent)) {
+ smaps_pte_hole_lookup(addr, walk);
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (!non_swap_entry(swpent)) {
+ if (softleaf_is_swap(entry)) {
int mapcount;
mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(swpent);
+ mapcount = swp_swapcount(entry);
if (mapcount >= 2) {
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
@@ -514,29 +1035,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
- if (!page)
- return;
-
- if (xa_is_value(page))
- mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
- return;
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_device_private(entry))
+ present = true;
+ page = softleaf_to_page(entry);
+ }
}
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, young, dirty, locked, present);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -547,27 +1056,34 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool present = false;
+ struct folio *folio;
+ if (pmd_none(*pmd))
+ return;
if (pmd_present(*pmd)) {
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
- } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pmd);
+ page = vm_normal_page_pmd(vma, addr, *pmd);
+ present = true;
+ } else if (unlikely(thp_migration_supported())) {
+ const softleaf_t entry = softleaf_from_pmd(*pmd);
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
- if (PageAnon(page))
+ folio = page_folio(page);
+ if (folio_test_anon(folio))
mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (is_zone_device_page(page))
+ else if (folio_is_zone_device(folio))
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -590,14 +1106,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- if (pmd_trans_unstable(pmd))
- goto out;
- /*
- * The mmap_lock held all the way back in m_start() is what
- * keeps khugepaged out of here and from collapsing things
- * in here.
- */
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE)
smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
@@ -610,8 +1123,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
* Don't forget to update Documentation/ on changes.
+ *
+ * The second dimension of mnemonics[] needs to be 3
+ * instead of the previous 2
+ * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
+ * to avoid a spurious
+ * -Werror=unterminated-string-initialization warning
+ * with GCC 15.
*/
- static const char mnemonics[BITS_PER_LONG][2] = {
+ static const char mnemonics[BITS_PER_LONG][3] = {
/*
* In case if we meet a flag we don't know about.
*/
@@ -627,13 +1147,14 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
- [ilog2(VM_DENYWRITE)] = "dw",
+ [ilog2(VM_MAYBE_GUARD)] = "gu",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
@@ -653,16 +1174,34 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
+#if CONFIG_ARCH_PKEY_BITS > 3
[ilog2(VM_PKEY_BIT3)] = "",
-#if VM_PKEY_BIT4
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
[ilog2(VM_PKEY_BIT4)] = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ [ilog2(VM_UFFD_MINOR)] = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+ [ilog2(VM_SHADOW_STACK)] = "ss",
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+ [ilog2(VM_DROPPABLE)] = "dp",
+#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
@@ -670,11 +1209,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
- seq_putc(m, mnemonics[i][0]);
- seq_putc(m, mnemonics[i][1]);
- seq_putc(m, ' ');
- }
+ if (vma->vm_flags & (1UL << i))
+ seq_printf(m, "%s ", mnemonics[i]);
}
seq_putc(m, '\n');
}
@@ -686,26 +1222,32 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page = NULL;
+ struct folio *folio = NULL;
+ bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
- if (pte_present(*pte)) {
- page = vm_normal_page(vma, addr, *pte);
- } else if (is_swap_pte(*pte)) {
- swp_entry_t swpent = pte_to_swp_entry(*pte);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
+ if (pte_present(ptent)) {
+ folio = page_folio(pte_page(ptent));
+ present = true;
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ if (softleaf_has_pfn(entry))
+ folio = softleaf_to_folio(entry);
}
- if (page) {
- int mapcount = page_mapcount(page);
- if (mapcount >= 2)
+ if (folio) {
+ /* We treat non-present entries as "maybe shared". */
+ if (!present || folio_maybe_mapped_shared(folio) ||
+ hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
#else
@@ -715,20 +1257,31 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
.pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
+ struct mem_size_stats *mss, unsigned long start)
{
-#ifdef CONFIG_SHMEM
- /* In case of smaps_rollup, reset the value from previous vma */
- mss->check_shmem_swap = false;
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -742,18 +1295,19 @@ static void smap_gather_stats(struct vm_area_struct *vma,
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- mss->check_shmem_swap = true;
- walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
- return;
+ ops = &smaps_shmem_walk_ops;
}
}
-#endif
+
/* mmap_lock is held in m_start */
- walk_page_vma(vma, &smaps_walk_ops, mss);
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
#define SEQ_PUT_DEC(str, val) \
@@ -765,6 +1319,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
{
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
if (rollup_mode) {
/*
* These are meaningful only for smaps_rollup, otherwise two of
@@ -783,10 +1338,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
- SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
@@ -801,11 +1357,9 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);
show_map_vma(m, vma);
@@ -816,8 +1370,9 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_enabled(vma));
+ seq_printf(m, "THPeligible: %8u\n",
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
+ THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -829,37 +1384,108 @@ static int show_smap(struct seq_file *m, void *v)
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
- struct mm_struct *mm;
+ struct mem_size_stats mss = {};
+ struct mm_struct *mm = priv->lock_ctx.mm;
struct vm_area_struct *vma;
- unsigned long last_vma_end = 0;
+ unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return -ESRCH;
- mm = priv->mm;
if (!mm || !mmget_not_zero(mm)) {
ret = -ESRCH;
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
hold_task_mempolicy(priv);
+ vma = vma_next(&vmi);
+
+ if (unlikely(!vma))
+ goto empty_set;
- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ vma_start = vma->vm_start;
+ do {
+ smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
- }
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
- last_vma_end, 0, 0, 0, 0);
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ vma_iter_invalidate(&vmi);
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * vma_next(vmi) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * vma_next(vmi) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * vma_next(vmi) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * vma_next(vmi) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = vma_next(&vmi);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 and 2 above */
+ if (vma->vm_start >= last_vma_end) {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
+ continue;
+ }
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end) {
+ smap_gather_stats(vma, &mss, last_vma_end);
+ last_vma_end = vma->vm_end;
+ }
+ }
+ } for_each_vma(vmi, vma);
+
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
@@ -904,9 +1530,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
goto out_free;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- ret = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
single_release(inode, file);
goto out_free;
@@ -924,8 +1550,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
kfree(priv);
return single_release(inode, file);
@@ -958,43 +1584,62 @@ struct clear_refs_private {
enum clear_refs_types type;
};
-#ifdef CONFIG_MEM_SOFT_DIRTY
+static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ struct folio *folio;
+
+ if (!pte_write(pte))
+ return false;
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+ if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)))
+ return false;
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio)
+ return false;
+ return folio_maybe_dma_pinned(folio);
+}
+
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
+ if (!pgtable_supports_soft_dirty())
+ return;
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
* Documentation/admin-guide/mm/soft-dirty.rst for full description
* of how soft-dirty works.
*/
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
+
+ if (pte_none(ptent))
+ return;
if (pte_present(ptent)) {
pte_t old_pte;
+ if (pte_is_pinned(vma, addr, ptent))
+ return;
old_pte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
- } else if (is_swap_pte(ptent)) {
+ } else {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
}
-#else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-#endif
-#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old, pmd = *pmdp;
+ if (!pgtable_supports_soft_dirty())
+ return;
+
if (pmd_present(pmd)) {
/* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp);
@@ -1007,7 +1652,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ } else if (pmd_is_migration_entry(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -1026,7 +1671,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -1038,23 +1683,24 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pmd_present(*pmd))
goto out;
- page = pmd_page(*pmd);
+ folio = pmd_folio(*pmd);
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
out:
spin_unlock(ptl);
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
-
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty(vma, addr, pte);
@@ -1064,14 +1710,14 @@ out:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -1103,21 +1749,20 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
- struct mmu_gather tlb;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1134,76 +1779,46 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ VMA_ITERATOR(vmi, mm, 0);
struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
};
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
- if (mmap_write_lock_killable(mm)) {
- count = -EINTR;
- goto out_mm;
- }
-
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm);
- mmap_write_unlock(mm);
- goto out_mm;
+ goto out_unlock;
}
- if (mmap_read_lock_killable(mm)) {
- count = -EINTR;
- goto out_mm;
- }
- tlb_gather_mmu(&tlb, mm, 0, -1);
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
- mmap_read_unlock(mm);
- if (mmap_write_lock_killable(mm)) {
- count = -EINTR;
- goto out_mm;
- }
- /*
- * Avoid to modify vma->vm_flags
- * without locked ops while the
- * coredump reads the vm_flags.
- */
- if (!mmget_still_valid(mm)) {
- /*
- * Silently return "count"
- * like if get_task_mm()
- * failed. FIXME: should this
- * function have returned
- * -ESRCH if get_task_mm()
- * failed like if
- * get_proc_task() fails?
- */
- mmap_write_unlock(mm);
- goto out_mm;
- }
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- vma->vm_flags &= ~VM_SOFTDIRTY;
- vma_set_page_prot(vma);
- }
- mmap_write_downgrade(mm);
- break;
+ vm_flags_clear(vma, VM_SOFTDIRTY);
+ vma_set_page_prot(vma);
}
+ inc_tlb_flush_pending(mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
- 0, NULL, mm, 0, -1UL);
+ 0, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
- &cp);
- if (type == CLEAR_REFS_SOFT_DIRTY)
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, 0, -1);
- mmap_read_unlock(mm);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
+out_unlock:
+ mmap_write_unlock(mm);
out_mm:
mmput(mm);
}
@@ -1235,6 +1850,8 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_GUARD_REGION BIT_ULL(58)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1246,8 +1863,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1255,6 +1871,13 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
return 0;
}
+static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
+{
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ return folio_precise_page_mapcount(folio, page) == 1;
+ return !folio_maybe_mapped_shared(folio);
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
@@ -1274,7 +1897,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1286,7 +1909,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1300,6 +1923,10 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ struct folio *folio;
+
+ if (pte_none(pte))
+ goto out;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1308,32 +1935,134 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
- } else if (is_swap_pte(pte)) {
- swp_entry_t entry;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ } else {
+ softleaf_t entry;
+
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
- entry = pte_to_swp_entry(pte);
- if (pm->show_pfn)
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ entry = softleaf_from_pte(pte);
+ if (pm->show_pfn) {
+ pgoff_t offset;
+
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry);
+ else
+ offset = swp_offset(entry);
frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
+ if (softleaf_is_uffd_wp_marker(entry))
+ flags |= PM_UFFD_WP;
+ if (softleaf_is_guard_marker(entry))
+ flags |= PM_GUARD_REGION;
+ }
- if (is_device_private_entry(entry))
- page = device_private_entry_to_page(entry);
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ if ((flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ flags |= PM_MMAP_EXCLUSIVE;
}
- if (page && !PageAnon(page))
- flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+out:
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
return make_pme(frame, flags);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct vm_area_struct *vma,
+ struct pagemapread *pm)
+{
+ unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
+ struct page *page = NULL;
+ struct folio *folio = NULL;
+ int err = 0;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ if (pmd_none(pmd))
+ goto populate_pagemap;
+
+ if (pmd_present(pmd)) {
+ page = pmd_page(pmd);
+
+ flags |= PM_PRESENT;
+ if (pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) + idx;
+ } else if (thp_migration_supported()) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+ unsigned long offset;
+
+ if (pm->show_pfn) {
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry) + idx;
+ else
+ offset = swp_offset(entry) + idx;
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
+ page = softleaf_to_page(entry);
+ }
+
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+populate_pagemap:
+ for (; addr != end; addr += PAGE_SIZE, idx++) {
+ u64 cur_flags = flags;
+ pagemap_entry_t pme;
+
+ if (folio && (flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ cur_flags |= PM_MMAP_EXCLUSIVE;
+
+ pme = make_pme(frame, cur_flags);
+ err = add_to_pagemap(&pme, pm);
+ if (err)
+ break;
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
+ }
+ return err;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -1346,76 +2075,26 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
- u64 flags = 0, frame = 0;
- pmd_t pmd = *pmdp;
- struct page *page = NULL;
-
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags |= PM_SOFT_DIRTY;
-
- if (pmd_present(pmd)) {
- page = pmd_page(pmd);
-
- flags |= PM_PRESENT;
- if (pmd_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- if (pm->show_pfn)
- frame = pmd_pfn(pmd) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- }
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- else if (is_swap_pmd(pmd)) {
- swp_entry_t entry = pmd_to_swp_entry(pmd);
- unsigned long offset;
-
- if (pm->show_pfn) {
- offset = swp_offset(entry) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- frame = swp_type(entry) |
- (offset << MAX_SWAPFILES_SHIFT);
- }
- flags |= PM_SWAP;
- if (pmd_swp_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- page = migration_entry_to_page(entry);
- }
-#endif
-
- if (page && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
-
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
-
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- break;
- if (pm->show_pfn) {
- if (flags & PM_PRESENT)
- frame++;
- else if (flags & PM_SWAP)
- frame += (1 << MAX_SWAPFILES_SHIFT);
- }
- }
+ err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
spin_unlock(ptl);
return err;
}
-
- if (pmd_trans_unstable(pmdp))
- return 0;
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return err;
+ }
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
- err = add_to_pagemap(addr, &pme, pm);
+ pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1435,38 +2114,47 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
- pte = huge_ptep_get(ptep);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
- struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(pte_page(pte));
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (page_mapcount(page) == 1)
+ if (!folio_maybe_mapped_shared(folio) &&
+ !hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
@@ -1479,6 +2167,7 @@ static const struct mm_walk_ops pagemap_ops = {
.pmd_entry = pagemap_pmd_range,
.pte_hole = pagemap_pte_hole,
.hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1492,7 +2181,9 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bit 58 pte is a guard region
+ * Bits 59-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1541,19 +2232,28 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
- start_vaddr = svpfn << PAGE_SHIFT;
end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > mm->task_size >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ unsigned long end;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+
+ end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
+ if (end >= start_vaddr && end < mm->task_size)
+ end_vaddr = end;
+ }
+
+ /* Ensure the address is inside the task */
+ if (start_vaddr > mm->task_size)
start_vaddr = end_vaddr;
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
ret = 0;
while (count && (start_vaddr < end_vaddr)) {
int len;
@@ -1597,8 +2297,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
struct mm_struct *mm;
mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
}
@@ -1612,11 +2312,794 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+ PAGE_IS_GUARD)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories;
+
+ if (pte_none(pte))
+ return 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories = PAGE_IS_PRESENT;
+
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ softleaf_t entry;
+
+ categories = PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ entry = softleaf_from_pte(pte);
+ if (softleaf_is_guard_marker(entry))
+ categories |= PAGE_IS_GUARD;
+ else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+ softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte, pte_t ptent)
+{
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(old_pte);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (pte_none(ptent)) {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ } else {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_none(pmd))
+ return categories;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_huge_zero_pmd(pmd))
+ categories |= PAGE_IS_PFNZERO;
+ if (pmd_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (pmd_is_migration_entry(pmd)) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pte_none(pte))
+ return categories;
+
+ /*
+ * According to pagemap_hugetlb_range(), file-backed HugeTLB
+ * page cannot be swapped. So PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ const unsigned long psize = huge_page_size(hstate_vma(vma));
+ softleaf_t entry;
+
+ if (huge_pte_none(ptent)) {
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+ return;
+ }
+
+ entry = softleaf_from_pte(ptent);
+ if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
+ return;
+
+ if (softleaf_is_migration(entry))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (!p->vec_buf)
+ return;
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+ bool wp_allowed = userfaultfd_wp_async(vma) &&
+ userfaultfd_wp_use_markers(vma);
+
+ if (!wp_allowed) {
+ /* User requested explicit failure over wp-async capability */
+ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+ /*
+ * User requires wr-protect, and allows silently skipping
+ * unsupported vmas.
+ */
+ if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ return 1;
+ /*
+ * Then the request doesn't involve wr-protects at all,
+ * fall through to the rest checks, and allow vma walk.
+ */
+ }
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (wp_allowed)
+ vma_category |= PAGE_IS_WPALLOWED;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma_category |= PAGE_IS_SOFT_DIRTY;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+ * When there is no output buffer provided at all, the sentinel values
+ * won't match here. There is no other way for `cur_buf->end` to be
+ * non-zero other than it being non-empty.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ arch_enter_lazy_mmu_mode();
+
+ if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptent);
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ size_mul(arg->vec_len, sizeof(struct page_region))))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
@@ -1641,28 +3124,34 @@ struct numa_maps_private {
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
- int count = page_mapcount(page);
+ struct folio *folio = page_folio(page);
+ int count;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ count = folio_precise_page_mapcount(folio, page);
+ else
+ count = folio_average_page_mapcount(folio);
md->pages += nr_pages;
- if (pte_dirty || PageDirty(page))
+ if (pte_dirty || folio_test_dirty(folio))
md->dirty += nr_pages;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
md->active += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
md->writeback += nr_pages;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)] += nr_pages;
+ md->node[folio_nid(folio)] += nr_pages;
}
static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
@@ -1675,7 +3164,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
@@ -1735,16 +3224,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
spin_unlock(ptl);
return 0;
}
-
- if (pmd_trans_unstable(pmd))
- return 0;
#endif
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
do {
- struct page *page = can_gather_numa_stats(*pte, vma, addr);
+ pte_t ptent = ptep_get(pte);
+ struct page *page = can_gather_numa_stats(ptent, vma, addr);
if (!page)
continue;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(ptent), 1);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
@@ -1755,19 +3246,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
@@ -1782,6 +3276,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
.hugetlb_entry = gather_hugetlb_stats,
.pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1795,8 +3290,9 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1805,7 +3301,7 @@ static int show_numa_map(struct seq_file *m, void *v)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
@@ -1817,10 +3313,10 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_path(m, file_user_path(file), "\n\t= ");
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a6d21fc0033c..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@@ -40,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
}
if (atomic_read(&mm->mm_count) > 1 ||
- vma->vm_flags & VM_MAYSHARE) {
+ is_nommu_shared_mapping(vma->vm_flags)) {
sbytes += size;
} else {
bytes += size;
@@ -53,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
sbytes += kobjsize(mm);
else
bytes += kobjsize(mm);
-
+
if (current->fs && current->fs->users > 1)
sbytes += kobjsize(current->fs);
else
@@ -71,26 +69,24 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
bytes += kobjsize(current); /* includes kernel stack */
+ mmap_read_unlock(mm);
+
seq_printf(m,
"Mem:\t%8lu bytes\n"
"Slack:\t%8lu bytes\n"
"Shared:\t%8lu bytes\n",
bytes, slack, sbytes);
-
- mmap_read_unlock(mm);
}
unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
mmap_read_unlock(mm);
return vsize;
}
@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -126,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -175,8 +157,8 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
- } else if (mm && is_stack(vma)) {
+ seq_path(m, file_user_path(file), "");
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
}
@@ -190,62 +172,74 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
+ return nommu_vma_show(m, _p);
+}
+
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return vma;
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
+ return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ mm = priv->lock_ctx.mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ vma_iter_init(&priv->iter, mm, last_addr);
- mmap_read_unlock(mm);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->lock_ctx.mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct rb_node *p = _p;
-
- (*pos)++;
- return p ? rb_next(p) : NULL;
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
@@ -265,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
seq_release_private(inode, file);
return err;
@@ -282,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
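
For reference, a minimal sketch (not part of this diff) of the maple-tree based
iteration that the task_nommu.c hunks above switch to; it mirrors the converted
task_vsize():

	#include <linux/mm.h>
	#include <linux/mmap_lock.h>

	static unsigned long sum_vma_sizes(struct mm_struct *mm)
	{
		VMA_ITERATOR(vmi, mm, 0);	/* start the walk at address 0 */
		struct vm_area_struct *vma;
		unsigned long vsize = 0;

		mmap_read_lock(mm);
		for_each_vma(vmi, vma)		/* in-order walk, no rb_node needed */
			vsize += vma->vm_end - vma->vm_start;
		mmap_read_unlock(mm);
		return vsize;
	}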
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..d6113dbe58e0 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = {
.get_link = proc_thread_self_get_link,
};
-static unsigned thread_self_inum __ro_after_init;
+unsigned thread_self_inum __ro_after_init;
int proc_setup_thread_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *thread_self;
int ret = -ENOMEM;
@@ -46,24 +45,20 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
- d_add(thread_self, inode);
+ d_make_persistent(thread_self, inode);
ret = 0;
- } else {
- dput(thread_self);
}
+ dput(thread_self);
}
inode_unlock(root_inode);
if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
- else
- fs_info->proc_thread_self = thread_self;
-
return ret;
}
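
A small userspace sketch (not part of this diff) showing what the
/proc/thread-self symlink set up above resolves to; the target is relative to
/proc, e.g. "1234/task/1234":

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[64];
		ssize_t n = readlink("/proc/thread-self", buf, sizeof(buf) - 1);

		if (n < 0)
			return 1;
		buf[n] = '\0';	/* readlink() does not NUL-terminate */
		printf("thread-self -> %s\n", buf);
		return 0;
	}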
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..b5343d209381 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -7,23 +7,28 @@
#include <linux/time.h>
#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
+#include "internal.h"
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
@@ -35,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v)
static int __init proc_uptime_init(void)
{
- proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_uptime_init);
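
A userspace sketch (not part of this diff) of consuming the two fields
uptime_proc_show() prints; the second value is idle time summed over all CPUs,
so on SMP it can legitimately exceed the first:

	#include <stdio.h>

	int main(void)
	{
		double uptime, idle;
		FILE *f = fopen("/proc/uptime", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%lf %lf", &uptime, &idle) != 2) {
			fclose(f);
			return 1;
		}
		printf("up %.2fs, idle %.2fs\n", uptime, idle);
		fclose(f);
		return 0;
	}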
diff --git a/fs/proc/version.c b/fs/proc/version.c
index b449f186577f..02e3c3cd4a9a 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -5,6 +5,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include "internal.h"
static int version_proc_show(struct seq_file *m, void *v)
{
@@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v)
static int __init proc_version_init(void)
{
- proc_create_single("version", 0, NULL, version_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index c3a345c28a93..f188bd900eb2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -8,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) "vmcore: " fmt
+
#include <linux/mm.h>
#include <linux/kcore.h>
#include <linux/user.h>
@@ -25,8 +27,8 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <linux/uaccess.h>
-#include <linux/mem_encrypt.h>
+#include <linux/uio.h>
+#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -51,9 +53,14 @@ static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore;
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+struct vmcoredd_node {
+ struct list_head list; /* List of dumps */
+ void *buf; /* Buffer containing device's dump */
+ unsigned int size; /* Size of the buffer */
+};
+
/* Device Dump list and mutex to synchronize access to list */
static LIST_HEAD(vmcoredd_list);
-static DEFINE_MUTEX(vmcoredd_mutex);
static bool vmcoredd_disabled;
core_param(novmcoredd, vmcoredd_disabled, bool, 0);
@@ -62,54 +69,100 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+static DEFINE_MUTEX(vmcore_mutex);
+
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
+/* Whether the vmcore is currently open. */
+static unsigned int vmcore_open;
+
+static void vmcore_process_device_ram(struct vmcore_cb *cb);
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+void register_vmcore_cb(struct vmcore_cb *cb)
{
- if (oldmem_pfn_is_ram)
- return -EBUSY;
- oldmem_pfn_is_ram = fn;
- return 0;
+ INIT_LIST_HEAD(&cb->next);
+ mutex_lock(&vmcore_mutex);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ if (!vmcore_open && cb->get_device_ram)
+ vmcore_process_device_ram(cb);
+ mutex_unlock(&vmcore_mutex);
}
-EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
-void unregister_oldmem_pfn_is_ram(void)
+void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- oldmem_pfn_is_ram = NULL;
- wmb();
+ mutex_lock(&vmcore_mutex);
+ list_del_rcu(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ mutex_unlock(&vmcore_mutex);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
-EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
{
- int (*fn)(unsigned long pfn);
- /* pfn is ram unless fn() checks pagetype */
- int ret = 1;
+ struct vmcore_cb *cb;
+ bool ret = true;
- /*
- * Ask hypervisor if the pfn is really ram.
- * A ballooned page contains no data and reading from such a page
- * will cause high load in the hypervisor.
- */
- fn = oldmem_pfn_is_ram;
- if (fn)
- ret = fn(pfn);
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
return ret;
}
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ mutex_lock(&vmcore_mutex);
+ vmcore_opened = true;
+ if (vmcore_open + 1 == 0) {
+ mutex_unlock(&vmcore_mutex);
+ return -EBUSY;
+ }
+ vmcore_open++;
+ mutex_unlock(&vmcore_mutex);
+
+ return 0;
+}
+
+static int release_vmcore(struct inode *inode, struct file *file)
+{
+ mutex_lock(&vmcore_mutex);
+ vmcore_open--;
+ mutex_unlock(&vmcore_mutex);
+
+ return 0;
+}
+
/* Reads a page from the oldmem device from given offset. */
-ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf,
- bool encrypted)
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
- size_t nr_bytes;
+ ssize_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -117,6 +170,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -124,28 +178,29 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
- memset(buf, 0, nr_bytes);
- else {
+ if (!pfn_is_ram(pfn)) {
+ tmp = iov_iter_zero(nr_bytes, iter);
+ } else {
if (encrypted)
- tmp = copy_oldmem_page_encrypted(pfn, buf,
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
nr_bytes,
- offset,
- userbuf);
+ offset);
else
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
-
- if (tmp < 0)
- return tmp;
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
+ }
+ if (tmp < nr_bytes) {
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return -EFAULT;
}
+
*ppos += nr_bytes;
count -= nr_bytes;
- buf += nr_bytes;
read += nr_bytes;
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return read;
}
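
A minimal driver-side sketch (not part of this diff) of the callback interface
introduced above; struct vmcore_cb, register_vmcore_cb() and is_kdump_kernel()
come from <linux/crash_dump.h>, while mydrv_page_is_unbacked() is a
hypothetical helper standing in for device-specific knowledge:

	#include <linux/crash_dump.h>

	static bool mydrv_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
	{
		/*
		 * Returning false makes read_from_oldmem() emit zeroes for
		 * this page instead of touching it (e.g. a ballooned page).
		 */
		return !mydrv_page_is_unbacked(pfn);	/* hypothetical helper */
	}

	static struct vmcore_cb mydrv_vmcore_cb = {
		.pfn_is_ram = mydrv_pfn_is_ram,
	};

	static void mydrv_setup_kdump(void)
	{
		if (is_kdump_kernel())
			register_vmcore_cb(&mydrv_vmcore_cb);
	}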
@@ -169,7 +224,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, false);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
}
/*
@@ -177,7 +237,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -194,60 +260,38 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
/*
* Architectures which support memory encryption override this.
*/
-ssize_t __weak
-copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
-{
- return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
-}
-
-/*
- * Copy to either kernel or user space
- */
-static int copy_to(void *target, void *src, size_t size, int userbuf)
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
{
- if (userbuf) {
- if (copy_to_user((char __user *) target, src, size))
- return -EFAULT;
- } else {
- memcpy(target, src, size);
- }
- return 0;
+ return copy_oldmem_page(iter, pfn, csize, offset);
}
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
-static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
{
struct vmcoredd_node *dump;
u64 offset = 0;
- int ret = 0;
size_t tsz;
char *buf;
- mutex_lock(&vmcoredd_mutex);
list_for_each_entry(dump, &vmcoredd_list, list) {
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (copy_to(dst, buf, tsz, userbuf)) {
- ret = -EFAULT;
- goto out_unlock;
- }
+ if (copy_to_iter(buf, tsz, iter) < tsz)
+ return -EFAULT;
size -= tsz;
start += tsz;
- dst += tsz;
/* Leave now if buffer filled already */
if (!size)
- goto out_unlock;
+ return 0;
}
offset += dump->size;
}
-out_unlock:
- mutex_unlock(&vmcoredd_mutex);
- return ret;
+ return 0;
}
#ifdef CONFIG_MMU
@@ -256,20 +300,16 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
{
struct vmcoredd_node *dump;
u64 offset = 0;
- int ret = 0;
size_t tsz;
char *buf;
- mutex_lock(&vmcoredd_mutex);
list_for_each_entry(dump, &vmcoredd_list, list) {
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
if (remap_vmalloc_range_partial(vma, dst, buf, 0,
- tsz)) {
- ret = -EFAULT;
- goto out_unlock;
- }
+ tsz))
+ return -EFAULT;
size -= tsz;
start += tsz;
@@ -277,14 +317,12 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
/* Leave now if buffer filled already */
if (!size)
- goto out_unlock;
+ return 0;
}
offset += dump->size;
}
-out_unlock:
- mutex_unlock(&vmcoredd_mutex);
- return ret;
+ return 0;
}
#endif /* CONFIG_MMU */
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
@@ -292,37 +330,32 @@ out_unlock:
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
-static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
- int userbuf)
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
+ struct vmcore_range *m = NULL;
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
- struct vmcore *m = NULL;
- if (buflen == 0 || *fpos >= vmcore_size)
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
- /* trim buflen to not go beyond EOF */
- if (buflen > vmcore_size - *fpos)
- buflen = vmcore_size - *fpos;
+ iov_iter_truncate(iter, vmcore_size - *fpos);
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
- if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
- /* Read Elf note segment */
+ /* Read ELF note segment */
if (*fpos < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
@@ -339,71 +372,97 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
/* Read device dumps */
if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
- (size_t)*fpos, buflen);
+ (size_t)*fpos, iov_iter_count(iter));
start = *fpos - elfcorebuf_sz;
- if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ if (vmcoredd_copy_dumps(iter, start, tsz))
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (!buflen)
+ if (!iov_iter_count(iter))
return acc;
}
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
/* Read remaining elf notes */
- tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
- if (copy_to(buffer, kaddr, tsz, userbuf))
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
+
+ cond_resched();
}
list_for_each_entry(m, &vmcore_list, list) {
if (*fpos < m->offset + m->size) {
tsz = (size_t)min_t(unsigned long long,
m->offset + m->size - *fpos,
- buflen);
+ iov_iter_count(iter));
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, mem_encrypt_active());
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
+
+ cond_resched();
}
return acc;
}
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
+{
+ return __read_vmcore(iter, &iocb->ki_pos);
+}
+
+/**
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *vmcore_alloc_buf(size_t size)
{
- return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+#ifdef CONFIG_MMU
+ return vmalloc_user(size);
+#else
+ return vzalloc(size);
+#endif
}
/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+
+/*
* The vmcore fault handler uses the page cache and fills data using the
- * standard __vmcore_read() function.
+ * standard __read_vmcore() function.
*
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
@@ -413,9 +472,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
struct page *page;
loff_t offset;
- char *buf;
int rc;
page = find_or_create_page(mapping, index, GFP_KERNEL);
@@ -423,8 +483,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
offset = (loff_t) index << PAGE_SHIFT;
- buf = __va((page_to_pfn(page) << PAGE_SHIFT));
- rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
unlock_page(page);
put_page(page);
@@ -444,33 +507,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
-/**
- * vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
- *
- * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
- * the buffer to user-space by means of remap_vmalloc_range().
- *
- * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
- * disabled and there's no need to allow users to mmap the buffer.
- */
-static inline char *vmcore_alloc_buf(size_t size)
-{
-#ifdef CONFIG_MMU
- return vmalloc_user(size);
-#else
- return vzalloc(size);
-#endif
-}
-
-/*
- * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
- * essential for mmap_vmcore() in order to map physically
- * non-contiguous objects (ELF header, ELF note segment and memory
- * regions in the 1st kernel pointed to by PT_LOAD entries) into
- * virtually contiguous user-space in ELF layout.
- */
-#ifdef CONFIG_MMU
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.
@@ -537,21 +573,26 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ int ret, idx;
+
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- if (oldmem_pfn_is_ram)
- return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ if (!list_empty(&vmcore_cb_list))
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
- return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return ret;
}
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
u64 start, end, len, tsz;
- struct vmcore *m;
+ struct vmcore_range *m;
start = (u64)vma->vm_pgoff << PAGE_SHIFT;
end = start + size;
@@ -562,8 +603,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (vma->vm_flags & (VM_WRITE | VM_EXEC))
return -EPERM;
- vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
- vma->vm_flags |= VM_MIXEDMAP;
+ vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC);
vma->vm_ops = &vmcore_mmap_ops;
len = 0;
@@ -668,21 +708,18 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
#endif
static const struct proc_ops vmcore_proc_ops = {
- .proc_read = read_vmcore,
+ .proc_open = open_vmcore,
+ .proc_release = release_vmcore,
+ .proc_read_iter = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
};
-static struct vmcore* __init get_new_element(void)
-{
- return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
-}
-
static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
struct list_head *vc_list)
{
+ struct vmcore_range *m;
u64 size;
- struct vmcore *m;
size = elfsz + elfnotesegsz;
list_for_each_entry(m, vc_list, list) {
@@ -857,7 +894,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
- phdr.p_align = 0;
+ phdr.p_align = 4;
/* Add merged PT_NOTE program header*/
tmp = elfptr + sizeof(Elf64_Ehdr);
@@ -1048,7 +1085,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
- phdr.p_align = 0;
+ phdr.p_align = 4;
/* Add merged PT_NOTE program header*/
tmp = elfptr + sizeof(Elf32_Ehdr);
@@ -1084,12 +1121,11 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
Elf64_Ehdr *ehdr_ptr;
Elf64_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1103,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -1127,12 +1158,11 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
Elf32_Ehdr *ehdr_ptr;
Elf32_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1146,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -1165,10 +1190,10 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
struct list_head *vc_list)
{
+ struct vmcore_range *m;
loff_t vmcore_off;
- struct vmcore *m;
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
@@ -1193,7 +1218,7 @@ static int __init parse_crash_elf64_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1249,7 +1274,7 @@ static int __init parse_crash_elf32_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1350,18 +1375,17 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
vdd_hdr->n_type = NT_VMCOREDD;
- strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
- sizeof(vdd_hdr->name));
- memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+ strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME);
+ strscpy_pad(vdd_hdr->dump_name, data->dump_name);
}
/**
- * vmcoredd_update_program_headers - Update all Elf program headers
+ * vmcoredd_update_program_headers - Update all ELF program headers
* @elfptr: Pointer to elf header
* @elfnotesz: Size of elf notes aligned to page size
* @vmcoreddsz: Size of device dumps to be added to elf note header
*
- * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Determine type of ELF header (Elf64 or Elf32) and update the elf note size.
* Also update the offsets of all the program headers after the elf note header.
*/
static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
@@ -1419,10 +1443,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
/**
* vmcoredd_update_size - Update the total size of the device dumps and update
- * Elf header
+ * ELF header
* @dump_size: Size of the current device dump to be added to total size
*
- * Update the total size of all the device dumps and update the Elf program
+ * Update the total size of all the device dumps and update the ELF program
* headers. Calculate the new offsets for the vmcore list and update the
* total vmcore size.
*/
@@ -1446,7 +1470,7 @@ static void vmcoredd_update_size(size_t dump_size)
* @data: dump info.
*
* Allocate a buffer and invoke the calling driver's dump collect routine.
- * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * Write ELF note at the beginning of the buffer to indicate vmcore device
* dump and add the dump to global list.
*/
int vmcore_add_device_dump(struct vmcoredd_data *data)
@@ -1466,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return -EINVAL;
dump = vzalloc(sizeof(*dump));
- if (!dump) {
- ret = -ENOMEM;
- goto out_err;
- }
+ if (!dump)
+ return -ENOMEM;
/* Keep size of the buffer page aligned so that it can be mmaped */
data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
@@ -1494,31 +1516,186 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
dump->buf = buf;
dump->size = data_size;
- /* Add the dump to driver sysfs list */
- mutex_lock(&vmcoredd_mutex);
- list_add_tail(&dump->list, &vmcoredd_list);
- mutex_unlock(&vmcoredd_mutex);
+ /* Add the dump to driver sysfs list and update the elfcore hdr */
+ scoped_guard(mutex, &vmcore_mutex) {
+ if (vmcore_opened)
+ pr_warn_once("Unexpected adding of device dump\n");
+ if (vmcore_open) {
+ ret = -EBUSY;
+ goto out_err;
+ }
- vmcoredd_update_size(data_size);
+ list_add_tail(&dump->list, &vmcoredd_list);
+ vmcoredd_update_size(data_size);
+ }
return 0;
out_err:
- if (buf)
- vfree(buf);
-
- if (dump)
- vfree(dump);
+ vfree(buf);
+ vfree(dump);
return ret;
}
EXPORT_SYMBOL(vmcore_add_device_dump);
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
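
For context, a sketch (not part of this diff, names illustrative) of how a
driver feeds a device dump into /proc/vmcore through the interface kept here;
struct vmcoredd_data and vmcore_add_device_dump() are the existing API from
<linux/crash_dump.h>, with cxgb4 as the in-tree user:

	#include <linux/crash_dump.h>
	#include <linux/string.h>

	static int mydrv_collect_dump(struct vmcoredd_data *data, void *buf)
	{
		/* Fill buf with up to data->size bytes of device state. */
		memset(buf, 0, data->size);	/* placeholder contents */
		return 0;
	}

	static struct vmcoredd_data mydrv_dump_data = {
		.dump_name = "MYDRV_DUMP",
		.size = 64 * 1024,	/* vmcore rounds the buffer up to a page */
		.vmcoredd_callback = mydrv_collect_dump,
	};

	static int mydrv_register_dump(void)
	{
		return vmcore_add_device_dump(&mydrv_dump_data);
	}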
+#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
+static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size)
+{
+ char *elfcorebuf_new;
+
+ if (WARN_ON_ONCE(new_size < elfcorebuf_sz))
+ return -EINVAL;
+ if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) {
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+ }
+
+ elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(new_size));
+ if (!elfcorebuf_new)
+ return -ENOMEM;
+ memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz);
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = elfcorebuf_new;
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+}
+
+static void vmcore_reset_offsets_elf64(void)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz;
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ Elf64_Phdr *phdr;
+ int i;
+
+ for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) {
+ u64 start, end;
+
+ /*
+ * After merge_note_headers_elf64() we should only have a single
+ * PT_NOTE entry that starts immediately after elfcorebuf_sz.
+ */
+ if (phdr->p_type == PT_NOTE) {
+ phdr->p_offset = elfcorebuf_sz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE);
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off = vmcore_off + end - start;
+ }
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+}
+
+static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ struct vmcore_range *cur;
+ Elf64_Phdr *phdr;
+ size_t new_size;
+ int rc;
+
+ if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) {
+ pr_err("too many device ram ranges\n");
+ return -ENOSPC;
+ }
+
+ /* elfcorebuf_sz must always cover full pages. */
+ new_size = sizeof(Elf64_Ehdr) +
+ (ehdr->e_phnum + count) * sizeof(Elf64_Phdr);
+ new_size = roundup(new_size, PAGE_SIZE);
+
+ /*
+ * Make sure we have sufficient space to include the new PT_LOAD
+ * entries.
+ */
+ rc = vmcore_realloc_elfcore_buffer_elf64(new_size);
+ if (rc) {
+ pr_err("resizing elfcore failed\n");
+ return rc;
+ }
+
+ /* Modify our used elfcore buffer size to cover the new entries. */
+ elfcorebuf_sz = new_size;
+
+ /* Fill the added PT_LOAD entries. */
+ phdr = phdr_start + ehdr->e_phnum;
+ list_for_each_entry(cur, list, list) {
+ WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE));
+ elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size);
+
+ /* p_offset will be adjusted later. */
+ phdr++;
+ ehdr->e_phnum++;
+ }
+ list_splice_tail(list, &vmcore_list);
+
+ /* We changed elfcorebuf_sz and added new entries; reset all offsets. */
+ vmcore_reset_offsets_elf64();
+
+ /* Finally, recalculate the total vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+ return 0;
+}
+
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+ unsigned char *e_ident = (unsigned char *)elfcorebuf;
+ struct vmcore_range *first, *m;
+ LIST_HEAD(list);
+ int count;
+
+ /* We only support Elf64 dumps for now. */
+ if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) {
+ pr_err("device ram ranges only support Elf64\n");
+ return;
+ }
+
+ if (cb->get_device_ram(cb, &list)) {
+ pr_err("obtaining device ram ranges failed\n");
+ return;
+ }
+ count = list_count_nodes(&list);
+ if (!count)
+ return;
+
+ /*
+ * For some reason these ranges are already known? Might happen
+ * with unusual register->unregister->register sequences; we'll simply
+ * sanity check using the first range.
+ */
+ first = list_first_entry(&list, struct vmcore_range, list);
+ list_for_each_entry(m, &vmcore_list, list) {
+ unsigned long long m_end = m->paddr + m->size;
+ unsigned long long first_end = first->paddr + first->size;
+
+ if (first->paddr < m_end && m->paddr < first_end)
+ goto out_free;
+ }
+
+ /* If adding the mem nodes succeeds, they must not be freed. */
+ if (!vmcore_add_device_ram_elf64(&list, count))
+ return;
+out_free:
+ vmcore_free_ranges(&list);
+}
+#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
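
A sketch (not part of this diff) of the ->get_device_ram() side that
vmcore_process_device_ram() calls into; virtio-mem is the intended user, the
hard-coded range below is purely illustrative, and reported ranges must be
page aligned, as the WARN_ON_ONCE() in vmcore_add_device_ram_elf64() checks:

	#include <linux/crash_dump.h>

	static int mydrv_get_device_ram(struct vmcore_cb *cb,
					struct list_head *list)
	{
		/* Report plugged device RAM so it becomes a PT_LOAD entry. */
		return vmcore_alloc_add_range(list, 0x140000000ULL,
					      0x8000000ULL /* 128 MiB */);
	}

	static struct vmcore_cb mydrv_vmcore_cb = {
		.get_device_ram = mydrv_get_device_ram,
	};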
+
/* Free all dumps in vmcore device dump list */
static void vmcore_free_device_dumps(void)
{
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
- mutex_lock(&vmcoredd_mutex);
+ mutex_lock(&vmcore_mutex);
while (!list_empty(&vmcoredd_list)) {
struct vmcoredd_node *dump;
@@ -1528,7 +1705,7 @@ static void vmcore_free_device_dumps(void)
vfree(dump->buf);
vfree(dump);
}
- mutex_unlock(&vmcoredd_mutex);
+ mutex_unlock(&vmcore_mutex);
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
}
@@ -1549,7 +1726,8 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
- pr_warn("Kdump: vmcore not initialized\n");
+ elfcorehdr_free(elfcorehdr_addr);
+ pr_warn("not initialized\n");
return rc;
}
elfcorehdr_free(elfcorehdr_addr);
@@ -1570,14 +1748,7 @@ void vmcore_cleanup(void)
proc_vmcore = NULL;
}
- /* clear the vmcore list. */
- while (!list_empty(&vmcore_list)) {
- struct vmcore *m;
-
- m = list_first_entry(&vmcore_list, struct vmcore, list);
- list_del(&m->list);
- kfree(m);
- }
+ vmcore_free_ranges(&vmcore_list);
free_elfcorebuf();
/* clear vmcore device dump list */