summaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig21
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/array.c76
-rw-r--r--fs/proc/base.c355
-rw-r--r--fs/proc/bootconfig.c6
-rw-r--r--fs/proc/consoles.c7
-rw-r--r--fs/proc/fd.c97
-rw-r--r--fs/proc/generic.c69
-rw-r--r--fs/proc/inode.c90
-rw-r--r--fs/proc/internal.h128
-rw-r--r--fs/proc/interrupts.c4
-rw-r--r--fs/proc/kcore.c173
-rw-r--r--fs/proc/meminfo.c26
-rw-r--r--fs/proc/namespaces.c11
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/page.c285
-rw-r--r--fs/proc/proc_net.c4
-rw-r--r--fs/proc/proc_sysctl.c341
-rw-r--r--fs/proc/root.c127
-rw-r--r--fs/proc/self.c12
-rw-r--r--fs/proc/softirqs.c2
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/proc/task_mmu.c1965
-rw-r--r--fs/proc/task_nommu.c93
-rw-r--r--fs/proc/thread_self.c13
-rw-r--r--fs/proc/vmcore.c355
26 files changed, 3037 insertions, 1231 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 32b1116ae137..6ae966c561e7 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
- select CRASH_CORE
+ select VMCORE_INFO
help
Provides a virtual ELF core file of the live kernel. This can
be read with gdb and other ELF tools. No modifications can be
@@ -61,6 +61,25 @@ config PROC_VMCORE_DEVICE_DUMP
as ELF notes to /proc/vmcore. You can still disable device
dump using the kernel command line option 'novmcoredd'.
+config NEED_PROC_VMCORE_DEVICE_RAM
+ bool
+
+config PROC_VMCORE_DEVICE_RAM
+ def_bool y
+ depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM
+ depends on VIRTIO_MEM
+ help
+ If the elfcore hdr is allocated and prepared by the dump kernel
+ ("2nd kernel") instead of the crashed kernel, RAM provided by memory
+ devices such as virtio-mem will not be included in the dump
+ image, because only the device driver can properly detect them.
+
+ With this config enabled, these RAM ranges will be queried from the
+ device drivers once the device gets probed, so they can be included
+ in the crash dump.
+
+ Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM.
+
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
obj-y += proc.o
-CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o += -Wno-override-init
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d35bbf35a874..42932f88141a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -109,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
else if (p->flags & PF_KTHREAD)
get_kthread_comm(tcomm, sizeof(tcomm), p);
else
- __get_task_comm(tcomm, sizeof(tcomm), p);
+ get_task_comm(tcomm, p);
if (escape)
seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
@@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
@@ -422,7 +420,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
if (thp_enabled)
- thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+ thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
@@ -431,6 +429,11 @@ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
}
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+ struct task_struct *task)
+{
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -455,6 +458,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
+ arch_proc_pid_thread_features(m, task);
return 0;
}
@@ -471,13 +475,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -494,7 +497,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently.
*/
- if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
if (try_get_task_stack(task)) {
eip = KSTK_EIP(task);
esp = KSTK_ESP(task);
@@ -505,12 +508,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
- cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -521,27 +520,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- } while_each_thread(task, t);
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
- gtime += sig->gtime;
-
if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
exit_code = sig->group_exit_code;
}
@@ -555,10 +536,37 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);
- if (!whole) {
+
+ scoped_guard(rcu) {
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ }
+ }
+ }
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 05452c3b9872..4eec684baca9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -58,7 +58,6 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
@@ -85,6 +84,7 @@
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
@@ -97,6 +97,7 @@
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -116,6 +117,40 @@
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preseve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
struct pid_entry {
const char *name;
unsigned int len;
@@ -146,10 +181,10 @@ struct pid_entry {
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
-#define ATTR(LSM, NAME, MODE) \
+#define ATTR(LSMID, NAME, MODE) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_pid_attr_operations, \
- { .lsm = LSM })
+ { .lsmid = LSMID })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -381,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol. If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
*/
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -792,23 +827,31 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ * - Returns mm_struct* on success
+ * - Returns error code on failure
+ */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
- struct mm_struct *mm = ERR_PTR(-ESRCH);
+ struct mm_struct *mm;
- if (task) {
- mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
- put_task_struct(task);
+ if (!task)
+ return ERR_PTR(-ESRCH);
- if (!IS_ERR_OR_NULL(mm)) {
- /* ensure this mm_struct can't be freed */
- mmgrab(mm);
- /* but do not pin its memory */
- mmput(mm);
- }
- }
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ put_task_struct(task);
+
+ if (IS_ERR(mm))
+ return mm == ERR_PTR(-ESRCH) ? NULL : mm;
+
+ /* ensure this mm_struct can't be freed */
+ mmgrab(mm);
+ /* but do not pin its memory */
+ mmput(mm);
return mm;
}
@@ -817,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct mm_struct *mm = proc_mem_open(inode, mode);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
@@ -826,12 +869,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -854,7 +916,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
size_t this_len = min_t(size_t, count, PAGE_SIZE);
@@ -931,6 +995,7 @@ static const struct file_operations proc_mem_operations = {
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
@@ -1098,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
struct task_struct *p = find_lock_task_mm(task);
if (p) {
- if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
+ if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) {
mm = p->mm;
mmgrab(mm);
}
@@ -1153,11 +1218,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1213,11 +1277,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1358,13 +1421,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1432,7 +1495,6 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
@@ -1482,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -1509,11 +1569,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1666,10 +1725,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -1881,8 +1939,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
-
- put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block *sb,
@@ -1902,7 +1958,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -1966,7 +2022,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
@@ -2005,7 +2061,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
* performed a setuid(), etc.
*
*/
-static int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
@@ -2070,7 +2127,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
unsigned type = DT_UNKNOWN;
ino_t ino = 1;
- child = d_hash_and_lookup(dir, &qname);
+ child = try_lookup_noperm(&qname, dir);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
@@ -2138,7 +2195,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
return 0;
}
-static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int map_files_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
unsigned long vm_start, vm_end;
bool exact_vma_exists = false;
@@ -2156,7 +2214,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_notask;
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm))
+ if (IS_ERR(mm))
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
@@ -2218,7 +2276,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
@@ -2281,8 +2339,8 @@ proc_map_files_instantiate(struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2442,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2457,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
+ timer->sigq.info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -2574,10 +2624,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (rt_or_dl_task_policy(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
@@ -2653,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
inode->i_fop = p->fop;
ei->op = p->op;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2730,7 +2780,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task, PROC_I(inode)->op.lsm,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2788,7 +2838,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ rv = security_setprocattr(PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name, page,
count);
mutex_unlock(&current->signal->cred_guard_mutex);
@@ -2817,7 +2867,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
.read = generic_read_dir, \
- .iterate = proc_##LSM##_attr_dir_iterate, \
+ .iterate_shared = proc_##LSM##_attr_dir_iterate, \
.llseek = default_llseek, \
}; \
\
@@ -2837,27 +2887,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
#ifdef CONFIG_SECURITY_SMACK
static const struct pid_entry smack_attr_dir_stuff[] = {
- ATTR("smack", "current", 0666),
+ ATTR(LSM_ID_SMACK, "current", 0666),
};
LSM_DIR_OPS(smack);
#endif
#ifdef CONFIG_SECURITY_APPARMOR
static const struct pid_entry apparmor_attr_dir_stuff[] = {
- ATTR("apparmor", "current", 0666),
- ATTR("apparmor", "prev", 0444),
- ATTR("apparmor", "exec", 0666),
+ ATTR(LSM_ID_APPARMOR, "current", 0666),
+ ATTR(LSM_ID_APPARMOR, "prev", 0444),
+ ATTR(LSM_ID_APPARMOR, "exec", 0666),
};
LSM_DIR_OPS(apparmor);
#endif
static const struct pid_entry attr_dir_stuff[] = {
- ATTR(NULL, "current", 0666),
- ATTR(NULL, "prev", 0444),
- ATTR(NULL, "exec", 0666),
- ATTR(NULL, "fscreate", 0666),
- ATTR(NULL, "keycreate", 0666),
- ATTR(NULL, "sockcreate", 0666),
+ ATTR(LSM_ID_UNDEF, "current", 0666),
+ ATTR(LSM_ID_UNDEF, "prev", 0444),
+ ATTR(LSM_ID_UNDEF, "exec", 0666),
+ ATTR(LSM_ID_UNDEF, "fscreate", 0666),
+ ATTR(LSM_ID_UNDEF, "keycreate", 0666),
+ ATTR(LSM_ID_UNDEF, "sockcreate", 0666),
#ifdef CONFIG_SECURITY_SMACK
DIR("smack", 0555,
proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
@@ -2912,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
ret = 0;
mm = get_task_mm(task);
if (mm) {
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
len = snprintf(buffer, sizeof(buffer), "%08lx\n",
- ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+ ((flags & MMF_DUMP_FILTER_MASK) >>
MMF_DUMP_FILTER_SHIFT));
mmput(mm);
ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -2952,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
- set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm);
else
- clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm);
}
mmput(mm);
@@ -2976,8 +3028,7 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
result = down_read_killable(&task->signal->exec_update_lock);
@@ -2989,15 +3040,21 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ }
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -3203,12 +3260,24 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
struct mm_struct *mm;
+ int ret = 0;
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
+ seq_printf(m, "ksm_merge_any: %s\n",
+ mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no");
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ mmput(mm);
+ return ret;
+ }
+ seq_printf(m, "ksm_mergeable: %s\n",
+ ksm_process_mergeable(mm) ? "yes" : "no");
+ mmap_read_unlock(mm);
mmput(mm);
}
@@ -3216,7 +3285,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_KSM */
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3229,7 +3298,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
prev_depth, depth);
return 0;
}
-#endif /* CONFIG_STACKLEAK_METRICS */
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
/*
* Thread groups
@@ -3251,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
@@ -3338,7 +3405,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
@@ -3428,8 +3495,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
set_nlink(inode, nlink_tgid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
@@ -3512,14 +3578,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
return 0;
if (pos == TGID_OFFSET - 2) {
- struct inode *inode = d_inode(fs_info->proc_self);
- if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
if (pos == TGID_OFFSET - 1) {
- struct inode *inode = d_inode(fs_info->proc_thread_self);
- if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
@@ -3583,7 +3647,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap,
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
@@ -3601,9 +3666,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
@@ -3732,8 +3795,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
set_nlink(inode, nlink_tid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
@@ -3813,11 +3875,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- pos = task = task->group_leader;
- do {
+ for_each_thread(task, pos) {
if (!nr--)
goto found;
- } while_each_thread(task, pos);
+ }
fail:
pos = NULL;
goto out;
@@ -3839,10 +3900,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
@@ -3864,12 +3923,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
- tid = (int)file->f_version;
- file->f_version = 0;
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
@@ -3879,12 +3938,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
tid = task_pid_nr_ns(task, ns);
if (!tid)
continue; /* The task has just exited. */
- len = snprintf(name, sizeof(name), "%u", tid);
+ len = snprintf(name, sizeof(name), "%d", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@@ -3899,7 +3958,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(path->dentry);
struct task_struct *p = get_proc_task(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (p) {
stat->nlink += get_nr_threads(p);
@@ -3909,6 +3968,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
return 0;
}
+/*
+ * proc_task_readdir() set @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@@ -3919,7 +3996,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 2e244ada1f97..87dcaae32ff8 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -63,6 +63,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
dst += ret;
}
}
+ if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
+ }
out:
kfree(key);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index e0758fe7936d..b7cab1ad990d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NBCON, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
@@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
@@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v)
}
static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(&console_mutex)
{
struct console *con;
loff_t off = 0;
@@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
}
static void c_stop(struct seq_file *m, void *v)
+ __releases(&console_mutex)
{
console_list_unlock();
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index b3140deebbbf..9eeccff49b2a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -39,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v)
spin_lock(&files->file_lock);
file = files_lookup_fd_locked(files, fd);
if (file) {
- struct fdtable *fdt = files_fdtable(files);
-
f_flags = file->f_flags;
- if (close_on_exec(fd, fdt))
+ if (close_on_exec(fd, files))
f_flags |= O_CLOEXEC;
get_file(file);
@@ -61,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v)
real_mount(file->f_path.mnt)->mnt_id,
file_inode(file)->i_ino);
- /* show_fd_locks() never deferences files so a stale value is safe */
+ /* show_fd_locks() never dereferences files, so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -74,7 +72,18 @@ out:
return 0;
}
-static int proc_fdinfo_access_allowed(struct inode *inode)
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, seq_show, inode);
+}
+
+/*
+ * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
+ * that the current task has PTRACE_MODE_READ in addition to the normal
+ * POSIX-like checks.
+ */
+static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -88,18 +97,13 @@ static int proc_fdinfo_access_allowed(struct inode *inode)
if (!allowed)
return -EACCES;
- return 0;
+ return generic_permission(idmap, inode, mask);
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
-{
- int ret = proc_fdinfo_access_allowed(inode);
-
- if (ret)
- return ret;
-
- return single_open(file, seq_show, inode);
-}
+static const struct inode_operations proc_fdinfo_file_inode_operations = {
+ .permission = proc_fdinfo_permission,
+ .setattr = proc_setattr,
+};
static const struct file_operations proc_fdinfo_file_operations = {
.open = seq_fdinfo_open,
@@ -112,11 +116,11 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
+ file = fget_task(task, fd);
+ if (file) {
*mode = file->f_mode;
- rcu_read_unlock();
+ fput(file);
+ }
return !!file;
}
@@ -136,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
security_task_to_inode(task, inode);
}
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+static int tid_fd_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
struct inode *inode;
@@ -214,8 +219,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry,
ei->op.proc_get_link = proc_fd_link;
tid_fd_update_inode(task, inode, data->mode);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -252,30 +257,27 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- rcu_read_lock();
for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = fget_task_next(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
- rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out;
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
out:
put_task_struct(p);
return 0;
@@ -305,14 +307,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count)
return 0;
}
-static int proc_readfd(struct file *file, struct dir_context *ctx)
+static int proc_fd_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfd,
+ .iterate_shared = proc_fd_iterate,
.llseek = generic_file_llseek,
};
@@ -350,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- int rv = 0;
-
- generic_fillattr(&nop_mnt_idmap, inode, stat);
-
- /* If it's a directory, put the number of open fds there */
- if (S_ISDIR(inode->i_mode)) {
- rv = proc_readfd_count(inode, &stat->size);
- if (rv < 0)
- return rv;
- }
- return rv;
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return proc_readfd_count(inode, &stat->size);
}
const struct inode_operations proc_fd_inode_operations = {
@@ -385,11 +378,13 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
ei = PROC_I(inode);
ei->fd = data->fd;
+ inode->i_op = &proc_fdinfo_file_inode_operations;
+
inode->i_fop = &proc_fdinfo_file_operations;
tid_fd_update_inode(task, inode, 0);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *
@@ -398,30 +393,20 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
}
-static int proc_open_fdinfo(struct inode *inode, struct file *file)
-{
- int ret = proc_fdinfo_access_allowed(inode);
-
- if (ret)
- return ret;
-
- return 0;
-}
-
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
+ .permission = proc_fdinfo_permission,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
- .open = proc_open_fdinfo,
.read = generic_read_dir,
- .iterate_shared = proc_readfdinfo,
+ .iterate_shared = proc_fdinfo_iterate,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 42ae38ff6e7e..501889856461 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap,
}
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum)
{
int i;
- i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
- GFP_KERNEL);
+ i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST,
+ GFP_KERNEL);
if (i < 0)
return i;
@@ -213,10 +213,11 @@ int proc_alloc_inum(unsigned int *inum)
void proc_free_inum(unsigned int inum)
{
- ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+ ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
-static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -253,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, de->proc_dops);
- return d_splice_alias(inode, dentry);
+ if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+ return d_splice_alias_ops(inode, dentry,
+ &proc_net_dentry_ops);
+ return d_splice_alias_ops(inode, dentry,
+ &proc_misc_dentry_ops);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -343,7 +347,8 @@ static const struct file_operations proc_dir_operations = {
.iterate_shared = proc_readdir,
};
-static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
return 0;
}
@@ -362,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -369,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -446,9 +473,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
- ent->proc_dops = &proc_misc_dentry_ops;
/* Revalidate everything under /proc/${pid}/net */
- if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
pde_force_lookup(ent);
out:
@@ -464,9 +490,9 @@ struct proc_dir_entry *proc_symlink(const char *name,
(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
if (ent) {
- ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ ent->size = strlen(dest);
+ ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL);
if (ent->data) {
- strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
ent = proc_register(parent, ent);
} else {
@@ -557,12 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static inline void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -573,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -679,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde)
}
}
+static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
+{
+ rb_erase(&pde->subdir_node, &parent->subdir);
+ RB_CLEAR_NODE(&pde->subdir_node);
+}
+
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@@ -701,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
WARN(1, "removing permanent /proc entry '%s'", de->name);
de = NULL;
} else {
- rb_erase(&de->subdir_node, &parent->subdir);
+ pde_erase(de, parent);
if (S_ISDIR(de->mode))
parent->nlink--;
}
@@ -745,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
root->parent->name, root->name);
return -EINVAL;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ pde_erase(root, parent);
de = root;
while (1) {
@@ -757,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
next->parent->name, next->name);
return -EINVAL;
}
- rb_erase(&next->subdir_node, &de->subdir);
+ pde_erase(next, de);
de = next;
continue;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 67b09a1d9433..b7634f975d98 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -30,7 +30,6 @@
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@@ -38,21 +37,12 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
- if (ei->pid) {
+ if (ei->pid)
proc_pid_evict_inode(ei);
- ei->pid = NULL;
- }
-
- /* Let go of any associated proc directory entry */
- de = ei->pde;
- if (de) {
- pde_put(de);
- ei->pde = NULL;
- }
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
+ struct proc_inode *ei = PROC_I(inode);
+
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
@@ -95,7 +92,7 @@ void __init proc_init_kmemcache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
pde_opener_cache =
@@ -110,18 +107,15 @@ void __init proc_init_kmemcache(void)
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
@@ -193,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
@@ -309,9 +303,7 @@ static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_read) read;
-
- read = pde->proc_ops->proc_read;
+ const auto read = pde->proc_ops->proc_read;
if (read)
return read(file, buf, count, ppos);
return -EIO;
@@ -333,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_write) write;
-
- write = pde->proc_ops->proc_write;
+ const auto write = pde->proc_ops->proc_write;
if (write)
return write(file, buf, count, ppos);
return -EIO;
@@ -357,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
- typeof_member(struct proc_ops, proc_poll) poll;
-
- poll = pde->proc_ops->proc_poll;
+ const auto poll = pde->proc_ops->proc_poll;
if (poll)
return poll(file, pts);
return DEFAULT_POLLMASK;
@@ -381,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
-
- ioctl = pde->proc_ops->proc_ioctl;
+ const auto ioctl = pde->proc_ops->proc_ioctl;
if (ioctl)
return ioctl(file, cmd, arg);
return -ENOTTY;
@@ -406,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ const auto compat_ioctl = pde->proc_ops->proc_compat_ioctl;
if (compat_ioctl)
return compat_ioctl(file, cmd, arg);
return -ENOTTY;
@@ -430,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
- typeof_member(struct proc_ops, proc_mmap) mmap;
-
- mmap = pde->proc_ops->proc_mmap;
+ const auto mmap = pde->proc_ops->proc_mmap;
if (mmap)
return mmap(file, vma);
return -EIO;
@@ -457,15 +439,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ if (pde->proc_ops->proc_get_unmapped_area)
+ return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
- get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags);
#endif
- if (get_area)
- return get_area(file, orig_addr, len, pgoff, flags);
+
return orig_addr;
}
@@ -491,10 +471,9 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
typeof_member(struct proc_ops, proc_open) open;
- typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
- if (!pde->proc_ops->proc_lseek)
+ if (!pde_has_proc_lseek(pde))
file->f_mode &= ~FMODE_LSEEK;
if (pde_is_permanent(pde)) {
@@ -518,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_ops->proc_release;
+ const auto release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -555,12 +534,9 @@ static int proc_reg_release(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
if (pde_is_permanent(pde)) {
- typeof_member(struct proc_ops, proc_release) release;
-
- release = pde->proc_ops->proc_release;
- if (release) {
+ const auto release = pde->proc_ops->proc_release;
+ if (release)
return release(inode, file);
- }
return 0;
}
@@ -660,7 +636,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
@@ -679,13 +655,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (S_ISREG(inode->i_mode)) {
inode->i_op = de->proc_iops;
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops;
else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (de->proc_ops->proc_compat_ioctl) {
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops_compat;
else
inode->i_fop = &proc_reg_file_ops_compat;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9dda7e54b2d0..c1e8eb984da8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -13,6 +13,7 @@
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
struct ctl_table_header;
struct mempolicy;
@@ -43,7 +44,6 @@ struct proc_dir_entry {
const struct proc_ops *proc_ops;
const struct file_operations *proc_dir_ops;
};
- const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -84,6 +84,25 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde)
pde->flags |= PROC_ENTRY_PERMANENT;
}
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -92,7 +111,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
- const char *lsm;
+ int lsmid;
};
struct proc_inode {
@@ -101,7 +120,7 @@ struct proc_inode {
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
- struct ctl_table *sysctl_entry;
+ const struct ctl_table *sysctl_entry;
struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
@@ -142,6 +161,80 @@ unsigned name_to_int(const struct qstr *qstr);
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+#ifdef CONFIG_PAGE_MAPCOUNT
+/**
+ * folio_precise_page_mapcount() - Number of mappings of this folio page.
+ * @folio: The folio.
+ * @page: The page.
+ *
+ * The number of present user page table entries that reference this page
+ * as tracked via the RMAP: either referenced directly (PTE) or as part of
+ * a larger area that covers this page (e.g., PMD).
+ *
+ * Use this function only for the calculation of existing statistics
+ * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
+ *
+ * Do not add new users.
+ *
+ * Returns: The number of mappings of this folio page. 0 for
+ * folios that are not mapped to user space or are not tracked via the RMAP
+ * (e.g., shared zeropage).
+ */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+
+ return mapcount;
+}
+#else /* !CONFIG_PAGE_MAPCOUNT */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_PAGE_MAPCOUNT */
+
+/**
+ * folio_average_page_mapcount() - Average number of mappings per page in this
+ * folio
+ * @folio: The folio.
+ *
+ * The average number of user page table entries that reference each page in
+ * this folio as tracked via the RMAP: either referenced directly (PTE) or
+ * as part of a larger area that covers this page (e.g., PMD).
+ *
+ * The average is calculated by rounding to the nearest integer; however,
+ * to avoid duplicated code in current callers, the average is at least
+ * 1 if any page of the folio is mapped.
+ *
+ * Returns: The average number of mappings per page in this folio.
+ */
+static inline int folio_average_page_mapcount(struct folio *folio)
+{
+ int mapcount, entire_mapcount, avg;
+
+ if (!folio_test_large(folio))
+ return atomic_read(&folio->_mapcount) + 1;
+
+ mapcount = folio_large_mapcount(folio);
+ if (unlikely(mapcount <= 0))
+ return 0;
+ entire_mapcount = folio_entire_mapcount(folio);
+ if (mapcount <= entire_mapcount)
+ return entire_mapcount;
+ mapcount -= entire_mapcount;
+
+ /* Round to closest integer ... */
+ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio);
+ /* ... but return at least 1. */
+ return max_t(int, avg + entire_mapcount, 1);
+}
/*
* array.c
*/
@@ -280,18 +373,27 @@ static inline void proc_tty_init(void) {}
extern struct proc_dir_entry proc_root;
extern void proc_self_init(void);
+extern unsigned self_inum, thread_self_inum;
/*
* task_[no]mmu.c
*/
struct mem_size_stats;
+
+struct proc_maps_locking_ctx {
+ struct mm_struct *mm;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
+};
+
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
- struct mm_struct *mm;
-#ifdef CONFIG_MMU
struct vma_iterator iter;
-#endif
+ loff_t last_pos;
+ struct proc_maps_locking_ctx lock_ctx;
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
@@ -316,5 +418,17 @@ extern const struct dentry_operations proc_net_dentry_ops;
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
- pde->proc_dops = &proc_net_dentry_ops;
+ pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
+}
+
+/*
+ * Add a new procfs dentry that can't serve as a mountpoint. That should
+ * encompass anything that is ephemeral and can just disappear while the
+ * process is still around.
+ */
+static inline struct dentry *proc_splice_unmountable(struct inode *inode,
+ struct dentry *dentry, const struct dentry_operations *d_ops)
+{
+ dont_mount(dentry);
+ return d_splice_alias_ops(inode, dentry, d_ops);
}
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index cb0edc7cbf09..714a22ded8a8 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -11,13 +11,13 @@
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= nr_irqs) ? pos : NULL;
+ return *pos <= irq_get_nr_irqs() ? pos : NULL;
}
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > nr_irqs)
+ if (*pos > irq_get_nr_irqs())
return NULL;
return pos;
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 9cb32e1a78a0..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
* Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
*/
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
@@ -34,8 +34,6 @@
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -50,8 +48,26 @@ static struct proc_dir_entry *proc_root_kcore;
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
+#ifndef kc_xlate_dev_mem_ptr
+#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr
+static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys)
+{
+ return __va(phys);
+}
+#endif
+#ifndef kc_unxlate_dev_mem_ptr
+#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr
+static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
+{
+}
+#endif
+
static LIST_HEAD(kclist_head);
-static DECLARE_RWSEM(kclist_lock);
+static int kcore_nphdr;
+static size_t kcore_phdrs_len;
+static size_t kcore_notes_len;
+static size_t kcore_data_offset;
+DEFINE_STATIC_PERCPU_RWSEM(kclist_lock);
static int kcore_need_update = 1;
/*
@@ -87,33 +103,34 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
list_add_tail(&new->list, &kclist_head);
}
-static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
- size_t *data_offset)
+static void update_kcore_size(void)
{
size_t try, size;
struct kcore_list *m;
- *nphdr = 1; /* PT_NOTE */
+ kcore_nphdr = 1; /* PT_NOTE */
size = 0;
list_for_each_entry(m, &kclist_head, list) {
try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > size)
size = try;
- *nphdr = *nphdr + 1;
+ kcore_nphdr++;
}
- *phdrs_len = *nphdr * sizeof(struct elf_phdr);
- *notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
- VMCOREINFO_NOTE_NAME_BYTES +
- ALIGN(sizeof(struct elf_prstatus), 4) +
- ALIGN(sizeof(struct elf_prpsinfo), 4) +
- ALIGN(arch_task_struct_size, 4) +
- ALIGN(vmcoreinfo_size, 4));
- *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
- *notes_len);
- return *data_offset + size;
+ kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
+ kcore_notes_len = (4 * sizeof(struct elf_note) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
+ kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len +
+ kcore_notes_len);
+ proc_root_kcore->size = kcore_data_offset + size;
}
#ifdef CONFIG_HIGHMEM
@@ -235,7 +252,7 @@ static int kcore_ram_list(struct list_head *list)
int nid, ret;
unsigned long end_pfn;
- /* Not inialized....update now */
+ /* Not initialized....update now */
/* find out "max pfn" */
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
@@ -256,12 +273,10 @@ static int kcore_update_ram(void)
{
LIST_HEAD(list);
LIST_HEAD(garbage);
- int nphdr;
- size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos;
int ret = 0;
- down_write(&kclist_lock);
+ percpu_down_write(&kclist_lock);
if (!xchg(&kcore_need_update, 0))
goto out;
@@ -279,11 +294,10 @@ static int kcore_update_ram(void)
}
list_splice_tail(&list, &kclist_head);
- proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
- &data_offset);
+ update_kcore_size();
out:
- up_write(&kclist_lock);
+ percpu_up_write(&kclist_lock);
list_for_each_entry_safe(pos, tmp, &garbage, list) {
list_del(&pos->list);
kfree(pos);
@@ -309,28 +323,27 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct file *file = iocb->ki_filp;
+ char *buf = file->private_data;
loff_t *fpos = &iocb->ki_pos;
- size_t phdrs_offset, notes_offset, data_offset;
+ size_t phdrs_offset, notes_offset;
size_t page_offline_frozen = 1;
- size_t phdrs_len, notes_len;
struct kcore_list *m;
size_t tsz;
- int nphdr;
unsigned long start;
size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
- down_read(&kclist_lock);
+ percpu_down_read(&kclist_lock);
/*
* Don't race against drivers that set PageOffline() and expect no
* further page access.
*/
page_offline_freeze();
- get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr);
- notes_offset = phdrs_offset + phdrs_len;
+ notes_offset = phdrs_offset + kcore_phdrs_len;
/* ELF file header. */
if (buflen && *fpos < sizeof(struct elfhdr)) {
@@ -352,7 +365,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
.e_flags = ELF_CORE_EFLAGS,
.e_ehsize = sizeof(struct elfhdr),
.e_phentsize = sizeof(struct elf_phdr),
- .e_phnum = nphdr,
+ .e_phnum = kcore_nphdr,
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
@@ -366,10 +379,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
}
/* ELF program headers. */
- if (buflen && *fpos < phdrs_offset + phdrs_len) {
+ if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) {
struct elf_phdr *phdrs, *phdr;
- phdrs = kzalloc(phdrs_len, GFP_KERNEL);
+ phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL);
if (!phdrs) {
ret = -ENOMEM;
goto out;
@@ -377,13 +390,14 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
phdrs[0].p_type = PT_NOTE;
phdrs[0].p_offset = notes_offset;
- phdrs[0].p_filesz = notes_len;
+ phdrs[0].p_filesz = kcore_notes_len;
phdr = &phdrs[1];
list_for_each_entry(m, &kclist_head, list) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X;
- phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
+ phdr->p_offset = kc_vaddr_to_offset(m->addr)
+ + kcore_data_offset;
phdr->p_vaddr = (size_t)m->addr;
if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr);
@@ -396,7 +410,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
phdr++;
}
- tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
+ tsz = min_t(size_t, buflen,
+ phdrs_offset + kcore_phdrs_len - *fpos);
if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
iter) != tsz) {
kfree(phdrs);
@@ -410,7 +425,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
}
/* ELF note segment. */
- if (buflen && *fpos < notes_offset + notes_len) {
+ if (buflen && *fpos < notes_offset + kcore_notes_len) {
struct elf_prstatus prstatus = {};
struct elf_prpsinfo prpsinfo = {
.pr_sname = 'R',
@@ -422,17 +437,17 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
strscpy(prpsinfo.pr_psargs, saved_command_line,
sizeof(prpsinfo.pr_psargs));
- notes = kzalloc(notes_len, GFP_KERNEL);
+ notes = kzalloc(kcore_notes_len, GFP_KERNEL);
if (!notes) {
ret = -ENOMEM;
goto out;
}
- append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
sizeof(prstatus));
- append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
sizeof(prpsinfo));
- append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
arch_task_struct_size);
/*
* vmcoreinfo_size is mostly constant after init time, but it
@@ -443,9 +458,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
*/
append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
vmcoreinfo_data,
- min(vmcoreinfo_size, notes_len - i));
+ min(vmcoreinfo_size, kcore_notes_len - i));
- tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
+ tsz = min_t(size_t, buflen,
+ notes_offset + kcore_notes_len - *fpos);
if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
@@ -461,7 +477,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list.
*/
- start = kc_offset_to_vaddr(*fpos - data_offset);
+ start = kc_offset_to_vaddr(*fpos - kcore_data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
@@ -469,19 +485,21 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
while (buflen) {
struct page *page;
unsigned long pfn;
+ phys_addr_t phys;
+ void *__start;
/*
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
*/
if (!m || start < m->addr || start >= m->addr + m->size) {
- struct kcore_list *iter;
+ struct kcore_list *pos;
m = NULL;
- list_for_each_entry(iter, &kclist_head, list) {
- if (start >= iter->addr &&
- start < iter->addr + iter->size) {
- m = iter;
+ list_for_each_entry(pos, &kclist_head, list) {
+ if (start >= pos->addr &&
+ start < pos->addr + pos->size) {
+ m = pos;
break;
}
}
@@ -535,7 +553,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
}
break;
case KCORE_RAM:
- pfn = __pa(start) >> PAGE_SHIFT;
+ phys = __pa(start);
+ pfn = phys >> PAGE_SHIFT;
page = pfn_to_online_page(pfn);
/*
@@ -544,7 +563,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* and explicitly excluded physical ranges.
*/
if (!page || PageOffline(page) ||
- is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
@@ -554,11 +574,38 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
+ if (m->type == KCORE_RAM) {
+ __start = kc_xlate_dev_mem_ptr(phys);
+ if (!__start) {
+ ret = -ENOMEM;
+ if (iov_iter_zero(tsz, iter) != tsz)
+ ret = -EFAULT;
+ goto out;
+ }
+ } else {
+ __start = (void *)start;
+ }
+
/*
- * We use _copy_to_iter() to bypass usermode hardening
- * which would otherwise prevent this operation.
+ * Sadly we must use a bounce buffer here to be able to
+ * make use of copy_from_kernel_nofault(), as these
+ * memory regions might not always be mapped on all
+ * architectures.
*/
- if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = copy_from_kernel_nofault(buf, __start, tsz);
+ if (m->type == KCORE_RAM)
+ kc_unxlate_dev_mem_ptr(phys, __start);
+ if (ret) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+ /*
+ * We know the bounce buffer is safe to copy from, so
+ * use _copy_to_iter() directly.
+ */
+ } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -579,7 +626,7 @@ skip:
out:
page_offline_thaw();
- up_read(&kclist_lock);
+ percpu_up_read(&kclist_lock);
if (ret)
return ret;
return orig_buflen - buflen;
@@ -595,6 +642,10 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -605,9 +656,17 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
static const struct proc_ops kcore_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
+ .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8dca4d6d96c7..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,6 +17,7 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
#include "internal.h"
@@ -88,10 +89,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
#ifdef CONFIG_ZSWAP
- seq_printf(m, "Zswap: %8lu kB\n",
- (unsigned long)(zswap_pool_total_size >> 10));
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
seq_printf(m, "Zswapped: %8lu kB\n",
- (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (unsigned long)atomic_long_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10));
#endif
show_val_kb(m, "Dirty: ",
@@ -120,10 +120,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
- show_val_kb(m, "Bounce: ",
- global_zone_page_state(NR_BOUNCE));
- show_val_kb(m, "WritebackTmp: ",
- global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "Bounce: ", 0);
+ show_val_kb(m, "WritebackTmp: ", 0);
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
@@ -132,17 +130,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
-#ifdef CONFIG_MEMTEST
- if (early_memtest_done) {
- unsigned long early_memtest_bad_size_kb;
-
- early_memtest_bad_size_kb = early_memtest_bad_size>>10;
- if (early_memtest_bad_size && !early_memtest_bad_size_kb)
- early_memtest_bad_size_kb = 1;
- /* When 0 is reported, it means there actually was a successful test */
- seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
- }
-#endif
+ memtest_report_meminfo(m);
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
@@ -172,6 +160,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Unaccepted: ",
global_zone_page_state(NR_UNACCEPTED));
#endif
+ show_val_kb(m, "Balloon: ",
+ global_node_page_state(NR_BALLOON_PAGES));
hugetlb_report_meminfo(m);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8e159fc78c0a..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
- res = readlink_copy(buffer, buflen, name);
+ res = readlink_copy(buffer, buflen, name, strlen(name));
}
put_task_struct(task);
return res;
@@ -111,14 +111,13 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
ei->ns_ops = ns_ops;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -152,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 195b077c0fac..f9b2c2c906cd 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -20,7 +20,12 @@
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
-#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+
+enum kpage_operation {
+ KPAGE_FLAGS,
+ KPAGE_COUNT,
+ KPAGE_CGROUP,
+};
static inline unsigned long get_max_dump_pfn(void)
{
@@ -37,21 +42,33 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
+static ssize_t kpage_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ enum kpage_operation op)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
+ struct page *page;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
- u64 pcount;
+ u64 info;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -65,14 +82,27 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
-
- if (!ppage || PageSlab(ppage) || page_has_type(ppage))
- pcount = 0;
- else
- pcount = page_mapcount(ppage);
-
- if (put_user(pcount, out)) {
+ page = pfn_to_online_page(pfn);
+
+ if (page) {
+ switch (op) {
+ case KPAGE_FLAGS:
+ info = stable_page_flags(page);
+ break;
+ case KPAGE_COUNT:
+ info = get_kpage_count(page);
+ break;
+ case KPAGE_CGROUP:
+ info = page_cgroup_ino(page);
+ break;
+ default:
+ info = 0;
+ break;
+ }
+ } else
+ info = 0;
+
+ if (put_user(info, out)) {
ret = -EFAULT;
break;
}
@@ -90,27 +120,37 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
+/* /proc/kpagecount - an array exposing page mapcounts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page mapcount.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return kpage_read(file, buf, count, ppos, KPAGE_COUNT);
+}
+
static const struct proc_ops kpagecount_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
{
return ((kflags >> kbit) & 1) << ubit;
}
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
{
- u64 k;
- u64 u;
+ const struct folio *folio;
+ struct page_snapshot ps;
+ unsigned long k;
+ unsigned long mapping;
+ bool is_anon;
+ u64 u = 0;
/*
* pseudo flag: KPF_NOPAGE
@@ -119,75 +159,63 @@ u64 stable_page_flags(struct page *page)
if (!page)
return 1 << KPF_NOPAGE;
- k = page->flags;
- u = 0;
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
+
+ k = folio->flags.f;
+ mapping = (unsigned long)folio->mapping;
+ is_anon = mapping & FOLIO_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
- *
- * Note that page->_mapcount is overloaded in SLAB, so the
- * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
- if (PageAnon(page))
+ if (is_anon) {
u |= 1 << KPF_ANON;
- if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ if (mapping & FOLIO_MAPPING_KSM)
+ u |= 1 << KPF_KSM;
+ }
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
- if (PageTail(page))
+ if (ps.idx == 0)
+ u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+ else
u |= 1 << KPF_COMPOUND_TAIL;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
- /*
- * PageTransCompound can be true for non-huge compound pages (slab
- * pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
- * to make sure a given page is a thp, not a non-huge compound page.
- */
- else if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
- else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
- }
- } else if (is_zero_pfn(page_to_pfn(page)))
+ else if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio)) {
+ /* Note: we indicate any THPs here, not just PMD-sized ones */
+ u |= 1 << KPF_THP;
+ } else if (is_huge_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ } else if (is_zero_pfn(ps.pfn)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ }
-
- /*
- * Caveats on high order pages: PG_buddy and PG_slab will only be set
- * on the head page.
- */
- if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY)
u |= 1 << KPF_BUDDY;
- if (PageOffline(page))
+ if (folio_test_offline(folio))
u |= 1 << KPF_OFFLINE;
- if (PageTable(page))
+ if (folio_test_pgtable(folio))
u |= 1 << KPF_PGTABLE;
+ if (folio_test_slab(folio))
+ u |= 1 << KPF_SLAB;
- if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+ u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
+#else
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
+#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
- u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(page))
- u |= 1 << KPF_SLAB;
-
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -197,7 +225,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+ if ((k & SWAPCACHE) == SWAPCACHE)
u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
@@ -205,67 +234,38 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
#ifdef CONFIG_MEMORY_FAILURE
- u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
-#endif
-
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
+ if (u & (1 << KPF_HUGE))
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ else
+ u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;
-};
+}
+EXPORT_SYMBOL_GPL(stable_page_flags);
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
static ssize_t kpageflags_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (put_user(stable_page_flags(ppage), out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_FLAGS);
}
static const struct proc_ops kpageflags_proc_ops = {
@@ -276,53 +276,10 @@ static const struct proc_ops kpageflags_proc_ops = {
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
- u64 ino;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (ppage)
- ino = page_cgroup_ino(ppage);
- else
- ino = 0;
-
- if (put_user(ino, out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_CGROUP);
}
-
static const struct proc_ops kpagecgroup_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a0c0419872e3..52f0b75cbce2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -135,6 +135,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
* @write: The write method with which to 'modify' the file.
+ * @state_size: The size of the per-file private state to allocate.
* @data: Data for retrieval by pde_data().
*
* Create a network namespaced proc file in the @parent directory with the
@@ -308,7 +309,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
net = get_proc_task_net(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (net != NULL) {
stat->nlink = net->proc_net->nlink;
@@ -321,6 +322,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
const struct inode_operations proc_net_inode_operations = {
.lookup = proc_tgid_net_lookup,
.getattr = proc_tgid_net_getattr,
+ .setattr = proc_setattr,
};
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5ea42653126e..49ab74e0bfde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -17,10 +17,12 @@
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
#include <linux/kmemleak.h>
+#include <linux/lockdep.h>
#include "internal.h"
-#define list_for_each_table_entry(entry, table) \
- for ((entry) = (table); (entry)->procname; (entry)++)
+#define list_for_each_table_entry(entry, header) \
+ entry = header->ctl_table; \
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
@@ -28,9 +30,12 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* Support for permanently empty directories */
-static struct ctl_table sysctl_mount_point[] = {
- {.type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY }
+/*
+ * Support for permanently empty directories.
+ * Must be non-empty to avoid sharing an address with other tables.
+ */
+static const struct ctl_table sysctl_mount_point[] = {
+ { }
};
/**
@@ -43,18 +48,16 @@ static struct ctl_table sysctl_mount_point[] = {
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl(path, sysctl_mount_point);
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
-#define sysctl_is_perm_empty_ctl_table(tptr) \
- (tptr[0].type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
#define sysctl_is_perm_empty_ctl_header(hptr) \
- (sysctl_is_perm_empty_ctl_table(hptr->ctl_table))
+ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
#define sysctl_set_perm_empty_ctl_header(hptr) \
- (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
#define sysctl_clear_perm_empty_ctl_header(hptr) \
- (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_DEFAULT)
+ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT)
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
@@ -65,12 +68,11 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
-static struct ctl_table root_table[] = {
+static const struct ctl_table root_table[] = {
{
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
- { }
};
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
@@ -87,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry);
+ const struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
@@ -108,14 +110,15 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2)
return cmp;
}
-/* Called under sysctl_lock */
-static struct ctl_table *find_entry(struct ctl_table_header **phead,
+static const struct ctl_table *find_entry(struct ctl_table_header **phead,
struct ctl_dir *dir, const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct rb_node *node = dir->root.rb_node;
+ lockdep_assert_held(&sysctl_lock);
+
while (node)
{
struct ctl_node *ctl_node;
@@ -140,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead,
return NULL;
}
-static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
struct rb_node **p = &head->parent->root.rb_node;
@@ -150,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
while (*p) {
struct ctl_table_header *parent_head;
- struct ctl_table *parent_entry;
+ const struct ctl_table *parent_entry;
struct ctl_node *parent_node;
const char *parent_name;
int cmp;
@@ -179,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
return 0;
}
-static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
@@ -188,9 +191,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
static void init_header(struct ctl_table_header *head,
struct ctl_table_root *root, struct ctl_table_set *set,
- struct ctl_node *node, struct ctl_table *table)
+ struct ctl_node *node, const struct ctl_table *table, size_t table_size)
{
head->ctl_table = table;
+ head->ctl_table_size = table_size;
head->ctl_table_arg = table;
head->used = 0;
head->count = 1;
@@ -202,26 +206,28 @@ static void init_header(struct ctl_table_header *head,
head->node = node;
INIT_HLIST_HEAD(&head->inodes);
if (node) {
- struct ctl_table *entry;
+ const struct ctl_table *entry;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, head) {
node->header = head;
node++;
}
}
+ if (table == sysctl_mount_point)
+ sysctl_set_perm_empty_ctl_header(head);
}
static void erase_header(struct ctl_table_header *head)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
- list_for_each_table_entry(entry, head->ctl_table)
+ list_for_each_table_entry(entry, head)
erase_entry(head, entry);
}
static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_table_header *dir_h = &dir->header;
int err;
@@ -231,7 +237,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
return -EROFS;
/* Am I creating a permanently empty directory? */
- if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) {
+ if (sysctl_is_perm_empty_ctl_header(header)) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
sysctl_set_perm_empty_ctl_header(dir_h);
@@ -242,7 +248,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- list_for_each_table_entry(entry, header->ctl_table) {
+ list_for_each_table_entry(entry, header) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -259,18 +265,20 @@ fail_links:
return err;
}
-/* called under sysctl_lock */
static int use_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (unlikely(p->unregistering))
return 0;
p->used++;
return 1;
}
-/* called under sysctl_lock */
static void unuse_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (!--p->used)
if (unlikely(p->unregistering))
complete(p->unregistering);
@@ -281,9 +289,11 @@ static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
-/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
+ /* will reacquire if has to wait */
+ lockdep_assert_held(&sysctl_lock);
+
/*
* if p->used is 0, nobody will ever touch that entry again;
* we'll eliminate all paths to it before dropping sysctl_lock
@@ -340,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root)
return set;
}
-static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
- struct ctl_dir *dir,
- const char *name, int namelen)
+static const struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
spin_lock(&sysctl_lock);
entry = find_entry(&head, dir, name, namelen);
@@ -370,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node)
}
static void first_entry(struct ctl_dir *dir,
- struct ctl_table_header **phead, struct ctl_table **pentry)
+ struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = NULL;
- struct ctl_table *entry = NULL;
+ const struct ctl_table *entry = NULL;
struct ctl_node *ctl_node;
spin_lock(&sysctl_lock);
@@ -387,10 +397,10 @@ static void first_entry(struct ctl_dir *dir,
*pentry = entry;
}
-static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = *phead;
- struct ctl_table *entry = *pentry;
+ const struct ctl_table *entry = *pentry;
struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
spin_lock(&sysctl_lock);
@@ -423,7 +433,7 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op)
{
struct ctl_table_root *root = head->root;
int mode;
@@ -437,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
}
static struct inode *proc_sys_make_inode(struct super_block *sb,
- struct ctl_table_header *head, struct ctl_table *table)
+ struct ctl_table_header *head, const struct ctl_table *table)
{
struct ctl_table_root *root = head->root;
struct inode *inode;
@@ -463,7 +473,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -477,12 +487,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
- root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
- else {
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- }
+ root->set_ownership(head, &inode->i_uid, &inode->i_gid);
return inode;
}
@@ -510,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
const struct qstr *name = &dentry->d_name;
- struct ctl_table *p;
+ const struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
struct ctl_dir *ctl_dir;
@@ -533,13 +541,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
}
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (IS_ERR(inode)) {
- err = ERR_CAST(inode);
- goto out;
- }
-
- d_set_d_op(dentry, &proc_sys_dentry_operations);
- err = d_splice_alias(inode, dentry);
+ err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations);
out:
if (h)
@@ -553,7 +555,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
{
struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
size_t count = iov_iter_count(iter);
char *kbuf;
ssize_t error;
@@ -627,7 +629,7 @@ static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
static int proc_sys_open(struct inode *inode, struct file *filp)
{
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
/* sysctl was unregistered */
if (IS_ERR(head))
@@ -645,7 +647,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
__poll_t ret = DEFAULT_POLLMASK;
unsigned long event;
@@ -676,7 +678,7 @@ out:
static bool proc_sys_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
@@ -697,20 +699,15 @@ static bool proc_sys_fill_cache(struct file *file,
if (d_in_lookup(child)) {
struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (IS_ERR(inode)) {
- d_lookup_done(child);
- dput(child);
- return false;
- }
- d_set_d_op(child, &proc_sys_dentry_operations);
- res = d_splice_alias(inode, child);
+ res = d_splice_alias_ops(inode, child,
+ &proc_sys_dentry_operations);
d_lookup_done(child);
if (unlikely(res)) {
- if (IS_ERR(res)) {
- dput(child);
- return false;
- }
dput(child);
+
+ if (IS_ERR(res))
+ return false;
+
child = res;
}
}
@@ -725,7 +722,7 @@ static bool proc_sys_fill_cache(struct file *file,
static bool proc_sys_link_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
bool ret = true;
@@ -743,7 +740,7 @@ out:
return ret;
}
-static int scan(struct ctl_table_header *head, struct ctl_table *table,
+static int scan(struct ctl_table_header *head, const struct ctl_table *table,
unsigned long *pos, struct file *file,
struct dir_context *ctx)
{
@@ -767,7 +764,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
@@ -800,7 +797,7 @@ static int proc_sys_permission(struct mnt_idmap *idmap,
* are _NOT_ writeable, capabilities or not.
*/
struct ctl_table_header *head;
- struct ctl_table *table;
+ const struct ctl_table *table;
int error;
/* Executable files are not allowed under /proc/sys/ */
@@ -844,12 +841,12 @@ static int proc_sys_getattr(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(path->dentry);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
@@ -886,7 +883,8 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -919,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
@@ -943,7 +945,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir,
const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
entry = find_entry(&head, dir, name, namelen);
if (!entry)
@@ -962,18 +964,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
- init_header(&new->header, set->dir.header.root, set, node, table);
+ init_header(&new->header, set->dir.header.root, set, node, table, 1);
return new;
}
@@ -1054,12 +1056,12 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry)
+ const struct ctl_table **pentry)
{
struct ctl_table_header *head;
+ const struct ctl_table *entry;
struct ctl_table_root *root;
struct ctl_table_set *set;
- struct ctl_table *entry;
struct ctl_dir *dir;
int ret;
@@ -1086,7 +1088,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
return ret;
}
-static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1102,8 +1104,9 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
return -EINVAL;
}
-static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+static int sysctl_check_table_array(const char *path, const struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
@@ -1115,6 +1118,19 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if (table->proc_handler == proc_dou8vec_minmax) {
if (table->maxlen != sizeof(u8))
err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
}
if (table->proc_handler == proc_dobool) {
@@ -1125,11 +1141,13 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
return err;
}
-static int sysctl_check_table(const char *path, struct ctl_table *table)
+static int sysctl_check_table(const char *path, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
int err = 0;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
if ((entry->proc_handler == proc_dostring) ||
(entry->proc_handler == proc_dobool) ||
(entry->proc_handler == proc_dointvec) ||
@@ -1159,25 +1177,23 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
return err;
}
-static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
- struct ctl_table_root *link_root)
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
{
- struct ctl_table *link_table, *entry, *link;
+ struct ctl_table *link_table, *link;
struct ctl_table_header *links;
+ const struct ctl_table *entry;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
- list_for_each_table_entry(entry, table) {
- nr_entries++;
+ list_for_each_table_entry(entry, head) {
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1185,35 +1201,41 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
link = link_table;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, head) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
- link->data = link_root;
+ link->data = head->root;
link_name += len;
link++;
}
- init_header(links, dir->header.root, dir->header.set, node, link_table);
- links->nreg = nr_entries;
+ init_header(links, dir->header.root, dir->header.set, node, link_table,
+ head->ctl_table_size);
+ links->nreg = head->ctl_table_size;
return links;
}
static bool get_links(struct ctl_dir *dir,
- struct ctl_table *table, struct ctl_table_root *link_root)
+ struct ctl_table_header *header,
+ struct ctl_table_root *link_root)
{
- struct ctl_table_header *head;
- struct ctl_table *entry, *link;
+ struct ctl_table_header *tmp_head;
+ const struct ctl_table *entry, *link;
+
+ if (header->ctl_table_size == 0 ||
+ sysctl_is_perm_empty_ctl_header(header))
+ return true;
/* Are there links available for every entry in table? */
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
if (!link)
return false;
if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
@@ -1224,10 +1246,10 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
- head->nreg++;
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
+ tmp_head->nreg++;
}
return true;
}
@@ -1246,13 +1268,13 @@ static int insert_links(struct ctl_table_header *head)
if (IS_ERR(core_parent))
return 0;
- if (get_links(core_parent, head->ctl_table, head->root))
+ if (get_links(core_parent, head, head->root))
return 0;
core_parent->header.nreg++;
spin_unlock(&sysctl_lock);
- links = new_links(core_parent, head->ctl_table, head->root);
+ links = new_links(core_parent, head);
spin_lock(&sysctl_lock);
err = -ENOMEM;
@@ -1260,7 +1282,7 @@ static int insert_links(struct ctl_table_header *head)
goto out;
err = 0;
- if (get_links(core_parent, head->ctl_table, head->root)) {
+ if (get_links(core_parent, head, head->root)) {
kfree(links);
goto out;
}
@@ -1306,27 +1328,23 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure without any child. This table
- * should not be free'd after registration. So it should not be
- * used on stack. It can either be a global or dynamically allocated
- * by the caller and free'd later after sysctl unregistration.
+ *
+ * @table: the top-level table structure. This table should not be free'd
+ * after registration. So it should not be used on stack. It can either
+ * be a global or dynamically allocated by the caller and free'd later
+ * after sysctl unregistration.
+ * @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
@@ -1334,8 +1352,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
- * sysctl_check_table() verifies this.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
@@ -1352,26 +1369,21 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
*/
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
- const char *path, struct ctl_table *table)
+ const char *path, const struct ctl_table *table, size_t table_size)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
struct ctl_dir *dir;
- struct ctl_table *entry;
struct ctl_node *node;
- int nr_entries = 0;
-
- list_for_each_table_entry(entry, table)
- nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
+ sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
node = (struct ctl_node *)(header + 1);
- init_header(header, root, set, node, table);
- if (sysctl_check_table(path, table))
+ init_header(header, root, set, node, table, table_size);
+ if (sysctl_check_table(path, header))
goto fail;
spin_lock(&sysctl_lock);
@@ -1401,7 +1413,7 @@ fail:
}
/**
- * register_sysctl - register a sysctl table
+ * register_sysctl_sz - register a sysctl table
* @path: The path to the directory the sysctl table is in. If the path
* doesn't exist we will create it for you.
* @table: the table structure. The calller must ensure the life of the @table
@@ -1411,18 +1423,20 @@ fail:
* to call unregister_sysctl_table() and can instead use something like
* register_sysctl_init() which does not care for the result of the syctl
* registration.
+ * @table_size: The number of elements in table.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_table for more details.
*/
-struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
+ size_t table_size)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
- path, table);
+ path, table, table_size);
}
-EXPORT_SYMBOL(register_sysctl);
+EXPORT_SYMBOL(register_sysctl_sz);
/**
* __register_sysctl_init() - register sysctl table to path
@@ -1433,6 +1447,7 @@ EXPORT_SYMBOL(register_sysctl);
* lifetime use of the sysctl.
* @table_name: The name of sysctl table, only used for log printing when
* registration fails
+ * @table_size: The number of elements in table
*
* The sysctl interface is used by userspace to query or modify at runtime
* a predefined value set on a variable. These variables however have default
@@ -1444,13 +1459,13 @@ EXPORT_SYMBOL(register_sysctl);
*
* Context: if your base directory does not exist it will be created for you.
*/
-void __init __register_sysctl_init(const char *path, struct ctl_table *table,
- const char *table_name)
+void __init __register_sysctl_init(const char *path, const struct ctl_table *table,
+ const char *table_name, size_t table_size)
{
- struct ctl_table_header *hdr = register_sysctl(path, table);
+ struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
if (unlikely(!hdr)) {
- pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path);
return;
}
kmemleak_not_leak(hdr);
@@ -1462,7 +1477,7 @@ static void put_links(struct ctl_table_header *header)
struct ctl_table_root *root = header->root;
struct ctl_dir *parent = header->parent;
struct ctl_dir *core_parent;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
if (header->set == root_set)
return;
@@ -1471,9 +1486,9 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- list_for_each_table_entry(entry, header->ctl_table) {
+ list_for_each_table_entry(entry, header) {
struct ctl_table_header *link_head;
- struct ctl_table *link;
+ const struct ctl_table *link;
const char *name = entry->procname;
link = find_entry(&link_head, core_parent, name, strlen(name));
@@ -1535,7 +1550,7 @@ void setup_sysctl_set(struct ctl_table_set *set,
{
memset(set, 0, sizeof(*set));
set->is_seen = is_seen;
- init_header(&set->dir.header, root, set, NULL, root_table);
+ init_header(&set->dir.header, root, set, NULL, root_table, 1);
}
void retire_sysctl_set(struct ctl_table_set *set)
@@ -1574,7 +1589,6 @@ static const struct sysctl_alias sysctl_aliases[] = {
{"hung_task_panic", "kernel.hung_task_panic" },
{"numa_zonelist_order", "vm.numa_zonelist_order" },
{"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
- {"softlockup_panic", "kernel.softlockup_panic" },
{ }
};
@@ -1590,6 +1604,13 @@ static const char *sysctl_find_alias(char *param)
return NULL;
}
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
/* Set sysctl value passed on kernel command line. */
static int process_sysctl_arg(char *param, char *val,
const char *unused, void *arg)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a86e65a608da..d8ca41d823e4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
@@ -188,7 +274,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
/* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
+ s->s_shrink->seeks = 0;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
@@ -261,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb)
{
struct proc_fs_info *fs_info = proc_sb_info(sb);
- if (!fs_info) {
- kill_anon_super(sb);
- return;
- }
-
- dput(fs_info->proc_self);
- dput(fs_info->proc_thread_self);
-
kill_anon_super(sb);
- put_pid_ns(fs_info->pid_ns);
- kfree(fs_info);
+ if (fs_info) {
+ put_pid_ns(fs_info->pid_ns);
+ kfree_rcu(fs_info, rcu);
+ }
}
static struct file_system_type proc_fs_type = {
@@ -314,7 +394,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
@@ -362,12 +443,12 @@ static const struct inode_operations proc_root_inode_operations = {
* This is the root "inode" in the /proc tree..
*/
struct proc_dir_entry proc_root = {
- .low_ino = PROC_ROOT_INO,
- .namelen = 5,
- .mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .nlink = 2,
+ .low_ino = PROCFS_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
.refcnt = REFCOUNT_INIT(1),
- .proc_iops = &proc_root_inode_operations,
+ .proc_iops = &proc_root_inode_operations,
.proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..62d2c0cfe35c 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = {
.get_link = proc_self_get_link,
};
-static unsigned self_inum __ro_after_init;
+unsigned self_inum __ro_after_init;
int proc_setup_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *self;
int ret = -ENOMEM;
@@ -46,23 +45,20 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
- d_add(self, inode);
+ d_make_persistent(self, inode);
ret = 0;
- } else {
- dput(self);
}
+ dput(self);
}
inode_unlock(root_inode);
if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- else
- fs_info->proc_self = self;
return ret;
}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index f4616083faef..04bb29721419 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v)
for (i = 0; i < NR_SOFTIRQS; i++) {
seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
- seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+ seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10);
seq_putc(p, '\n');
}
return 0;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index da60956b2915..8b444e862319 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -76,7 +76,7 @@ static void show_all_irqs(struct seq_file *p)
seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
next = i + 1;
}
- show_irq_gap(p, nr_irqs - next);
+ show_irq_gap(p, irq_get_nr_irqs() - next);
}
static int show_stat(struct seq_file *p, void *v)
@@ -196,7 +196,7 @@ static int stat_open(struct inode *inode, struct file *file)
unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
- size += 2 * nr_irqs;
+ size += 2 * irq_get_nr_irqs();
return single_open_size(file, show_stat, NULL, size);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 507cd4e59d07..81dfc26bfae8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
+#include <linux/ksm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
@@ -13,18 +14,24 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
+#include <linux/buildid.h>
#include <asm/elf.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
@@ -32,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -55,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -88,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -123,16 +130,143 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
- loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
+{
+ lock_ctx->locked_vma = NULL;
+ lock_ctx->mmap_locked = false;
+}
+
+static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->locked_vma) {
+ vma_end_read(lock_ctx->locked_vma);
+ lock_ctx->locked_vma = NULL;
+ }
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
+{
+ /*
+ * smaps and numa_maps perform page table walk, therefore require
+ * mmap_lock but maps can be read with locking just the vma and
+ * walking the vma tree under rcu read protection.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(lock_ctx->mm))
+ return false;
+
+ lock_ctx->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ reset_lock_ctx(lock_ctx);
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_unlock();
+ }
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
{
- struct vm_area_struct *vma = vma_next(&priv->iter);
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+ struct vm_area_struct *vma;
+
+ if (lock_ctx->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_ctx_vma(lock_ctx);
+ vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ lock_ctx->locked_vma = vma;
+
+ return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+ if (lock_ctx->mmap_locked)
+ return false;
+
+ rcu_read_unlock();
+ mmap_read_lock(lock_ctx->mm);
+ /* Reinitialize the iterator after taking mmap_lock */
+ vma_iter_set(&priv->iter, pos);
+ lock_ctx->mmap_locked = true;
+
+ return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+
+retry:
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR of EAGAIN is possible */
+ if (IS_ERR(vma)) {
+ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+ goto retry;
+
+ return vma;
+ }
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
if (vma) {
- *ppos = vma->vm_start;
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
} else {
- *ppos = -2UL;
- vma = get_gate_vma(priv->mm);
+ *ppos = SENTINEL_VMA_GATE;
+ vma = get_gate_vma(priv->lock_ctx.mm);
}
return vma;
@@ -141,58 +275,66 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = *ppos;
+ struct proc_maps_locking_ctx *lock_ctx;
+ loff_t last_addr = *ppos;
struct mm_struct *mm;
/* See m_next(). Zero at the start or after lseek. */
- if (last_addr == -1UL)
+ if (last_addr == SENTINEL_VMA_END)
return NULL;
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
+ lock_ctx = &priv->lock_ctx;
+ mm = lock_ctx->mm;
if (!mm || !mmget_not_zero(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
+ if (!lock_vma_range(m, lock_ctx)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
return ERR_PTR(-EINTR);
}
- vma_iter_init(&priv->iter, mm, last_addr);
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
+ vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);
- if (last_addr == -2UL)
+ if (last_addr == SENTINEL_VMA_GATE)
return get_gate_vma(mm);
- return proc_get_vma(priv, ppos);
+ return proc_get_vma(m, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- if (*ppos == -2UL) {
- *ppos = -1UL;
+ if (*ppos == SENTINEL_VMA_GATE) {
+ *ppos = SENTINEL_VMA_END;
return NULL;
}
- return proc_get_vma(m->private, ppos);
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mm_struct *mm = priv->mm;
+ struct mm_struct *mm = priv->lock_ctx.mm;
if (!priv->task)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_vma_range(&priv->lock_ctx);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -207,9 +349,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->lock_ctx.mm)) {
+ int err = PTR_ERR(priv->lock_ctx.mm);
seq_release_private(inode, file);
return err;
@@ -223,8 +365,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
@@ -236,19 +378,65 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct vm_area_struct *vma)
+static void get_vma_name(struct vm_area_struct *vma,
+ const struct path **path,
+ const char **name,
+ const char **name_fmt)
{
+ struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
+
+ *name = NULL;
+ *path = NULL;
+ *name_fmt = NULL;
+
/*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
*/
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
+ if (vma->vm_file) {
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name) {
+ *name_fmt = "[anon_shmem:%s]";
+ *name = anon_name->name;
+ } else {
+ *path = file_user_path(vma->vm_file);
+ }
+ return;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ *name = vma->vm_ops->name(vma);
+ if (*name)
+ return;
+ }
+
+ *name = arch_vma_name(vma);
+ if (*name)
+ return;
+
+ if (!vma->vm_mm) {
+ *name = "[vdso]";
+ return;
+ }
+
+ if (vma_is_initial_heap(vma)) {
+ *name = "[heap]";
+ return;
+ }
+
+ if (vma_is_initial_stack(vma)) {
+ *name = "[stack]";
+ return;
+ }
+
+ if (anon_name) {
+ *name_fmt = "[anon:%s]";
+ *name = anon_name->name;
+ return;
+ }
}
static void show_vma_header_prefix(struct seq_file *m,
@@ -274,18 +462,17 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
- struct anon_vma_name *anon_name = NULL;
- struct mm_struct *mm = vma->vm_mm;
- struct file *file = vma->vm_file;
+ const struct path *path;
+ const char *name_fmt, *name;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- const char *name = NULL;
- if (file) {
- struct inode *inode = file_inode(vma->vm_file);
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -294,58 +481,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- if (mm)
- anon_name = anon_vma_name(vma);
- /*
- * Print the dentry name for named mappings, and a
- * special [heap] marker for the heap:
- */
- if (file) {
+ get_vma_name(vma, &path, &name, &name_fmt);
+ if (path) {
seq_pad(m, ' ');
- /*
- * If user named this anon shared memory via
- * prctl(PR_SET_VMA ..., use the provided name.
- */
- if (anon_name)
- seq_printf(m, "[anon_shmem:%s]", anon_name->name);
- else
- seq_file_path(m, file, "\n");
- goto done;
- }
-
- if (vma->vm_ops && vma->vm_ops->name) {
- name = vma->vm_ops->name(vma);
- if (name)
- goto done;
- }
-
- name = arch_vma_name(vma);
- if (!name) {
- if (!mm) {
- name = "[vdso]";
- goto done;
- }
-
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
- name = "[heap]";
- goto done;
- }
-
- if (is_stack(vma)) {
- name = "[stack]";
- goto done;
- }
-
- if (anon_name) {
- seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name->name);
- }
- }
-
-done:
- if (name) {
+ seq_path(m, path, "\n");
+ } else if (name_fmt) {
+ seq_pad(m, ' ');
+ seq_printf(m, name_fmt, name);
+ } else if (name) {
seq_pad(m, ' ');
seq_puts(m, name);
}
@@ -370,11 +514,315 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_maps_op);
}
+#define PROCMAP_QUERY_VMA_FLAGS ( \
+ PROCMAP_QUERY_VMA_READABLE | \
+ PROCMAP_QUERY_VMA_WRITABLE | \
+ PROCMAP_QUERY_VMA_EXECUTABLE | \
+ PROCMAP_QUERY_VMA_SHARED \
+)
+
+#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
+ PROCMAP_QUERY_FILE_BACKED_VMA | \
+ PROCMAP_QUERY_VMA_FLAGS \
+)
+
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ reset_lock_ctx(lock_ctx);
+
+ return 0;
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ if (lock_ctx->mmap_locked) {
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+ } else {
+ unlock_ctx_vma(lock_ctx);
+ }
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ struct mm_struct *mm = lock_ctx->mm;
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+
+ if (lock_ctx->mmap_locked)
+ return find_vma(mm, addr);
+
+ /* Unlock previously locked VMA and find the next one under RCU */
+ unlock_ctx_vma(lock_ctx);
+ rcu_read_lock();
+ vma_iter_init(&vmi, mm, addr);
+ vma = lock_next_vma(mm, &vmi, addr);
+ rcu_read_unlock();
+
+ if (!vma)
+ return NULL;
+
+ if (!IS_ERR(vma)) {
+ lock_ctx->locked_vma = vma;
+ return vma;
+ }
+
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* Fallback to mmap_lock on vma->vm_refcnt overflow */
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ lock_ctx->mmap_locked = true;
+ }
+
+ return vma;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr)
+{
+ return find_vma(lock_ctx->mm, addr);
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
+ unsigned long addr, u32 flags)
+{
+ struct vm_area_struct *vma;
+
+next_vma:
+ vma = query_vma_find_by_addr(lock_ctx, addr);
+ if (IS_ERR(vma))
+ return vma;
+
+ if (!vma)
+ goto no_vma;
+
+ /* user requested only file-backed VMA, keep iterating */
+ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
+ goto skip_vma;
+
+ /* VMA permissions should satisfy query flags */
+ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
+ u32 perm = 0;
+
+ if (flags & PROCMAP_QUERY_VMA_READABLE)
+ perm |= VM_READ;
+ if (flags & PROCMAP_QUERY_VMA_WRITABLE)
+ perm |= VM_WRITE;
+ if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
+ perm |= VM_EXEC;
+ if (flags & PROCMAP_QUERY_VMA_SHARED)
+ perm |= VM_MAYSHARE;
+
+ if ((vma->vm_flags & perm) != perm)
+ goto skip_vma;
+ }
+
+ /* found covering VMA or user is OK with the matching next VMA */
+ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
+ return vma;
+
+skip_vma:
+ /*
+ * If the user needs closest matching VMA, keep iterating.
+ */
+ addr = vma->vm_end;
+ if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
+ goto next_vma;
+
+no_vma:
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
+{
+ struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
+ struct procmap_query karg;
+ struct vm_area_struct *vma;
+ const char *name = NULL;
+ char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
+ __u64 usize;
+ int err;
+
+ if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
+ return -EFAULT;
+ /* argument struct can never be that large, reject abuse */
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+ /* argument struct should have at least query_flags and query_addr fields */
+ if (usize < offsetofend(struct procmap_query, query_addr))
+ return -EINVAL;
+ err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ if (err)
+ return err;
+
+ /* reject unknown flags */
+ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
+ return -EINVAL;
+ /* either both buffer address and size are set, or both should be zero */
+ if (!!karg.vma_name_size != !!karg.vma_name_addr)
+ return -EINVAL;
+ if (!!karg.build_id_size != !!karg.build_id_addr)
+ return -EINVAL;
+
+ if (!mm || !mmget_not_zero(mm))
+ return -ESRCH;
+
+ err = query_vma_setup(&lock_ctx);
+ if (err) {
+ mmput(mm);
+ return err;
+ }
+
+ vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ vma = NULL;
+ goto out;
+ }
+
+ karg.vma_start = vma->vm_start;
+ karg.vma_end = vma->vm_end;
+
+ karg.vma_flags = 0;
+ if (vma->vm_flags & VM_READ)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
+ if (vma->vm_flags & VM_WRITE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
+ if (vma->vm_flags & VM_EXEC)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
+ if (vma->vm_flags & VM_MAYSHARE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
+
+ karg.vma_page_size = vma_kernel_pagesize(vma);
+
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
+ karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
+ karg.dev_major = MAJOR(inode->i_sb->s_dev);
+ karg.dev_minor = MINOR(inode->i_sb->s_dev);
+ karg.inode = inode->i_ino;
+ } else {
+ karg.vma_offset = 0;
+ karg.dev_major = 0;
+ karg.dev_minor = 0;
+ karg.inode = 0;
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.vma_name_size) {
+ size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
+ const struct path *path;
+ const char *name_fmt;
+ size_t name_sz = 0;
+
+ get_vma_name(vma, &path, &name, &name_fmt);
+
+ if (path || name_fmt || name) {
+ name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
+ if (!name_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (path) {
+ name = d_path(path, name_buf, name_buf_sz);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ goto out;
+ }
+ name_sz = name_buf + name_buf_sz - name;
+ } else if (name || name_fmt) {
+ name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
+ name = name_buf;
+ }
+ if (name_sz > name_buf_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.vma_name_size = name_sz;
+ }
+
+ /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+
+ if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
+ name, karg.vma_name_size)) {
+ kfree(name_buf);
+ return -EFAULT;
+ }
+ kfree(name_buf);
+
+ if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
+ build_id_buf, karg.build_id_size))
+ return -EFAULT;
+
+ if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
+ return -EFAULT;
+
+ return 0;
+
+out:
+ query_vma_teardown(&lock_ctx);
+ mmput(mm);
+ kfree(name_buf);
+ return err;
+}
+
+static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ switch (cmd) {
+ case PROCMAP_QUERY:
+ /* priv->lock_ctx.mm is set during file open operation */
+ return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = proc_map_release,
+ .unlocked_ioctl = procfs_procmap_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -412,6 +860,7 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long ksm;
u64 pss;
u64 pss_anon;
u64 pss_file;
@@ -422,14 +871,14 @@ struct mem_size_stats {
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
- struct page *page, unsigned long size, unsigned long pss,
+ struct folio *folio, unsigned long size, unsigned long pss,
bool dirty, bool locked, bool private)
{
mss->pss += pss;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
mss->pss_anon += pss;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->pss_shmem += pss;
else
mss->pss_file += pss;
@@ -437,7 +886,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
if (locked)
mss->pss_locked += pss;
- if (dirty || PageDirty(page)) {
+ if (dirty || folio_test_dirty(folio)) {
mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
@@ -453,53 +902,76 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
- bool migration)
+ bool present)
{
+ struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
+ bool exclusive;
+ int mapcount;
/*
* First accumulate quantities that depend only on |size| and the type
* of the compound page.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
mss->anonymous += size;
- if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ if (!folio_test_swapbacked(folio) && !dirty &&
+ !folio_test_dirty(folio))
mss->lazyfree += size;
}
+ if (folio_test_ksm(folio))
+ mss->ksm += size;
+
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || page_is_young(page) || PageReferenced(page))
+ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/*
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * page_count(page) == 1 guarantees the page is mapped exactly once.
- * If any subpage of the compound page mapped with PTE it would elevate
- * page_count().
+ * refcount == 1 for present entries guarantees that the folio is mapped
+ * exactly once. For large folios this implies that exactly one
+ * PTE/PMD/... maps (a part of) this folio.
+ *
+ * Treat all non-present entries (where relying on the mapcount and
+ * refcount doesn't make sense) as "maybe shared, but not sure how
+ * often". We treat device private entries as being fake-present.
*
- * The page_mapcount() is called to get a snapshot of the mapcount.
- * Without holding the page lock this snapshot can be slightly wrong as
- * we cannot always read the mapcount atomically. It is not safe to
- * call page_mapcount() even with PTL held if the page is not mapped,
- * especially for migration entries. Treat regular migration entries
- * as mapcount == 1.
+ * Note that it would not be safe to read the mapcount especially for
+ * pages referenced by migration entries, even with the PTL held.
*/
- if ((page_count(page) == 1) || migration) {
- smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
- locked, true);
+ if (folio_ref_count(folio) == 1 || !present) {
+ smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
+ dirty, locked, present);
return;
}
+
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ mapcount = folio_average_page_mapcount(folio);
+ exclusive = !folio_maybe_mapped_shared(folio);
+ }
+
+ /*
+ * We obtain a snapshot of the mapcount. Without holding the folio lock
+ * this snapshot can be slightly wrong as we cannot always read the
+ * mapcount atomically.
+ */
for (i = 0; i < nr; i++, page++) {
- int mapcount = page_mapcount(page);
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+ mapcount = folio_precise_page_mapcount(folio, page);
+ exclusive = mapcount < 2;
+ }
+
if (mapcount >= 2)
pss /= mapcount;
- smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
- mapcount < 2);
+ smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
+ dirty, locked, exclusive);
}
}
@@ -537,21 +1009,24 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false, young = false, dirty = false;
+ bool present = false, young = false, dirty = false;
pte_t ptent = ptep_get(pte);
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
young = pte_young(ptent);
dirty = pte_dirty(ptent);
- } else if (is_swap_pte(ptent)) {
- swp_entry_t swpent = pte_to_swp_entry(ptent);
+ present = true;
+ } else if (pte_none(ptent)) {
+ smaps_pte_hole_lookup(addr, walk);
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (!non_swap_entry(swpent)) {
+ if (softleaf_is_swap(entry)) {
int mapcount;
mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(swpent);
+ mapcount = swp_swapcount(entry);
if (mapcount >= 2) {
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
@@ -560,20 +1035,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_pfn_swap_entry(swpent)) {
- if (is_migration_entry(swpent))
- migration = true;
- page = pfn_swap_entry_to_page(swpent);
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_device_private(entry))
+ present = true;
+ page = softleaf_to_page(entry);
}
- } else {
- smaps_pte_hole_lookup(addr, walk);
- return;
}
if (!page)
return;
- smaps_account(mss, page, false, young, dirty, locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, present);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -584,32 +1056,34 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false;
+ bool present = false;
+ struct folio *folio;
+ if (pmd_none(*pmd))
+ return;
if (pmd_present(*pmd)) {
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
- } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pmd);
-
- if (is_migration_entry(entry)) {
- migration = true;
- page = pfn_swap_entry_to_page(entry);
- }
+ page = vm_normal_page_pmd(vma, addr, *pmd);
+ present = true;
+ } else if (unlikely(thp_migration_supported())) {
+ const softleaf_t entry = softleaf_from_pmd(*pmd);
+
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
- if (PageAnon(page))
+ folio = page_folio(page);
+ if (folio_test_anon(folio))
mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (is_zone_device_page(page))
+ else if (folio_is_zone_device(folio))
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
- locked, migration);
+ locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -649,8 +1123,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
* Don't forget to update Documentation/ on changes.
+ *
+ * The length of the second argument of mnemonics[]
+ * needs to be 3 instead of previously set 2
+ * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
+ * to avoid spurious
+ * -Werror=unterminated-string-initialization warning
+ * with GCC 15
*/
- static const char mnemonics[BITS_PER_LONG][2] = {
+ static const char mnemonics[BITS_PER_LONG][3] = {
/*
* In case if we meet a flag we don't know about.
*/
@@ -666,6 +1147,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
+ [ilog2(VM_MAYBE_GUARD)] = "gu",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@@ -701,14 +1183,25 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
+#if CONFIG_ARCH_PKEY_BITS > 3
[ilog2(VM_PKEY_BIT3)] = "",
-#if VM_PKEY_BIT4
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
[ilog2(VM_PKEY_BIT4)] = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
+ [ilog2(VM_SHADOW_STACK)] = "ss",
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+ [ilog2(VM_DROPPABLE)] = "dp",
+#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
@@ -716,11 +1209,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
- seq_putc(m, mnemonics[i][0]);
- seq_putc(m, mnemonics[i][1]);
- seq_putc(m, ' ');
- }
+ if (vma->vm_flags & (1UL << i))
+ seq_printf(m, "%s ", mnemonics[i]);
}
seq_putc(m, '\n');
}
@@ -732,23 +1222,32 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page = NULL;
- pte_t ptent = ptep_get(pte);
+ struct folio *folio = NULL;
+ bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
- } else if (is_swap_pte(ptent)) {
- swp_entry_t swpent = pte_to_swp_entry(ptent);
+ folio = page_folio(pte_page(ptent));
+ present = true;
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (is_pfn_swap_entry(swpent))
- page = pfn_swap_entry_to_page(swpent);
+ if (softleaf_has_pfn(entry))
+ folio = softleaf_to_folio(entry);
}
- if (page) {
- if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
+
+ if (folio) {
+ /* We treat non-present entries as "maybe shared". */
+ if (!present || folio_maybe_mapped_shared(folio) ||
+ hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
#else
@@ -758,12 +1257,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
.pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -837,6 +1338,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
@@ -855,9 +1357,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
smap_gather_stats(vma, &mss, 0);
@@ -870,8 +1370,9 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false, true));
+ seq_printf(m, "THPeligible: %8u\n",
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
+ THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -883,8 +1384,8 @@ static int show_smap(struct seq_file *m, void *v)
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
- struct mm_struct *mm = priv->mm;
+ struct mem_size_stats mss = {};
+ struct mm_struct *mm = priv->lock_ctx.mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
@@ -899,8 +1400,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
@@ -971,12 +1470,17 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
break;
/* Case 1 and 2 above */
- if (vma->vm_start >= last_vma_end)
+ if (vma->vm_start >= last_vma_end) {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
continue;
+ }
/* Case 4 above */
- if (vma->vm_end > last_vma_end)
+ if (vma->vm_end > last_vma_end) {
smap_gather_stats(vma, &mss, last_vma_end);
+ last_vma_end = vma->vm_end;
+ }
}
} for_each_vma(vmi, vma);
@@ -1026,9 +1530,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
goto out_free;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- ret = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
single_release(inode, file);
goto out_free;
@@ -1046,8 +1550,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
kfree(priv);
return single_release(inode, file);
@@ -1080,27 +1584,27 @@ struct clear_refs_private {
enum clear_refs_types type;
};
-#ifdef CONFIG_MEM_SOFT_DIRTY
-
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- struct page *page;
+ struct folio *folio;
if (!pte_write(pte))
return false;
if (!is_cow_mapping(vma->vm_flags))
return false;
- if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
+ if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)))
return false;
- page = vm_normal_page(vma, addr, pte);
- if (!page)
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio)
return false;
- return page_maybe_dma_pinned(page);
+ return folio_maybe_dma_pinned(folio);
}
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
+ if (!pgtable_supports_soft_dirty())
+ return;
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
@@ -1109,6 +1613,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
*/
pte_t ptent = ptep_get(pte);
+ if (pte_none(ptent))
+ return;
+
if (pte_present(ptent)) {
pte_t old_pte;
@@ -1118,24 +1625,21 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
- } else if (is_swap_pte(ptent)) {
+ } else {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
}
-#else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-#endif
-#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old, pmd = *pmdp;
+ if (!pgtable_supports_soft_dirty())
+ return;
+
if (pmd_present(pmd)) {
/* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp);
@@ -1148,7 +1652,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ } else if (pmd_is_migration_entry(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -1167,7 +1671,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -1179,12 +1683,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pmd_present(*pmd))
goto out;
- page = pmd_page(*pmd);
+ folio = pmd_folio(*pmd);
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
out:
spin_unlock(ptl);
return 0;
@@ -1206,14 +1710,14 @@ out:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -1245,20 +1749,20 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1347,6 +1851,7 @@ struct pagemapread {
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
#define PM_UFFD_WP BIT_ULL(57)
+#define PM_GUARD_REGION BIT_ULL(58)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1358,8 +1863,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
- struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
{
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
@@ -1367,6 +1871,13 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
return 0;
}
+static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
+{
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ return folio_precise_page_mapcount(folio, page) == 1;
+ return !folio_maybe_mapped_shared(folio);
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
@@ -1386,7 +1897,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
hole_end = end;
for (; addr < hole_end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1398,7 +1909,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_SOFTDIRTY)
pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
goto out;
}
@@ -1412,7 +1923,10 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
- bool migration = false;
+ struct folio *folio;
+
+ if (pte_none(pte))
+ goto out;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1423,122 +1937,149 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_SOFT_DIRTY;
if (pte_uffd_wp(pte))
flags |= PM_UFFD_WP;
- } else if (is_swap_pte(pte)) {
- swp_entry_t entry;
+ } else {
+ softleaf_t entry;
+
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
- entry = pte_to_swp_entry(pte);
+ entry = softleaf_from_pte(pte);
if (pm->show_pfn) {
pgoff_t offset;
+
/*
* For PFN swap offsets, keeping the offset field
* to be PFN only to be compatible with old smaps.
*/
- if (is_pfn_swap_entry(entry))
- offset = swp_offset_pfn(entry);
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry);
else
offset = swp_offset(entry);
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
- migration = is_migration_entry(entry);
- if (is_pfn_swap_entry(entry))
- page = pfn_swap_entry_to_page(entry);
- if (pte_marker_entry_uffd_wp(entry))
+ if (softleaf_has_pfn(entry))
+ page = softleaf_to_page(entry);
+ if (softleaf_is_uffd_wp_marker(entry))
flags |= PM_UFFD_WP;
+ if (softleaf_is_guard_marker(entry))
+ flags |= PM_GUARD_REGION;
}
- if (page && !PageAnon(page))
- flags |= PM_FILE;
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ if ((flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ flags |= PM_MMAP_EXCLUSIVE;
+ }
+
+out:
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
return make_pme(frame, flags);
}
-static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct vm_area_struct *vma,
+ struct pagemapread *pm)
{
- struct vm_area_struct *vma = walk->vma;
- struct pagemapread *pm = walk->private;
- spinlock_t *ptl;
- pte_t *pte, *orig_pte;
+ unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
+ struct page *page = NULL;
+ struct folio *folio = NULL;
int err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool migration = false;
- ptl = pmd_trans_huge_lock(pmdp, vma);
- if (ptl) {
- u64 flags = 0, frame = 0;
- pmd_t pmd = *pmdp;
- struct page *page = NULL;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
- if (vma->vm_flags & VM_SOFTDIRTY)
+ if (pmd_none(pmd))
+ goto populate_pagemap;
+
+ if (pmd_present(pmd)) {
+ page = pmd_page(pmd);
+
+ flags |= PM_PRESENT;
+ if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) + idx;
+ } else if (thp_migration_supported()) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+ unsigned long offset;
- if (pmd_present(pmd)) {
- page = pmd_page(pmd);
-
- flags |= PM_PRESENT;
- if (pmd_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- if (pmd_uffd_wp(pmd))
- flags |= PM_UFFD_WP;
- if (pm->show_pfn)
- frame = pmd_pfn(pmd) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- }
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- else if (is_swap_pmd(pmd)) {
- swp_entry_t entry = pmd_to_swp_entry(pmd);
- unsigned long offset;
-
- if (pm->show_pfn) {
- if (is_pfn_swap_entry(entry))
- offset = swp_offset_pfn(entry);
- else
- offset = swp_offset(entry);
- offset = offset +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- frame = swp_type(entry) |
- (offset << MAX_SWAPFILES_SHIFT);
- }
- flags |= PM_SWAP;
- if (pmd_swp_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- if (pmd_swp_uffd_wp(pmd))
- flags |= PM_UFFD_WP;
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- migration = is_migration_entry(entry);
- page = pfn_swap_entry_to_page(entry);
+ if (pm->show_pfn) {
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry) + idx;
+ else
+ offset = swp_offset(entry) + idx;
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
}
-#endif
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
+ page = softleaf_to_page(entry);
+ }
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+populate_pagemap:
+ for (; addr != end; addr += PAGE_SIZE, idx++) {
+ u64 cur_flags = flags;
+ pagemap_entry_t pme;
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
+ if (folio && (flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ cur_flags |= PM_MMAP_EXCLUSIVE;
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- break;
- if (pm->show_pfn) {
- if (flags & PM_PRESENT)
- frame++;
- else if (flags & PM_SWAP)
- frame += (1 << MAX_SWAPFILES_SHIFT);
- }
+ pme = make_pme(frame, cur_flags);
+ err = add_to_pagemap(&pme, pm);
+ if (err)
+ break;
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
}
+ }
+ return err;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
+ pte_t *pte, *orig_pte;
+ int err = 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ptl = pmd_trans_huge_lock(pmdp, vma);
+ if (ptl) {
+ err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
spin_unlock(ptl);
return err;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
/*
* We can assume that @vma always points to a valid one and @end never
@@ -1553,7 +2094,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pagemap_entry_t pme;
pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
break;
}
@@ -1573,20 +2114,23 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
- pte = huge_ptep_get(ptep);
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
- struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(pte_page(pte));
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (page_mapcount(page) == 1)
+ if (!folio_maybe_mapped_shared(folio) &&
+ !hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
if (huge_pte_uffd_wp(pte))
@@ -1603,13 +2147,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
- err = add_to_pagemap(addr, &pme, pm);
+ err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
@@ -1622,6 +2167,7 @@ static const struct mm_walk_ops pagemap_ops = {
.pmd_entry = pagemap_pmd_range,
.pte_hole = pagemap_pte_hole,
.hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1636,7 +2182,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
* Bit 57 pte is uffd-wp write-protected
- * Bits 58-60 zero
+ * Bit 58 pte is a guard region
+ * Bits 59-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1750,8 +2297,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
struct mm_struct *mm;
mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
}
@@ -1765,11 +2312,794 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+ PAGE_IS_GUARD)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories;
+
+ if (pte_none(pte))
+ return 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories = PAGE_IS_PRESENT;
+
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ softleaf_t entry;
+
+ categories = PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ entry = softleaf_from_pte(pte);
+ if (softleaf_is_guard_marker(entry))
+ categories |= PAGE_IS_GUARD;
+ else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+ softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte, pte_t ptent)
+{
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(old_pte);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (pte_none(ptent)) {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ } else {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_none(pmd))
+ return categories;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_huge_zero_pmd(pmd))
+ categories |= PAGE_IS_PFNZERO;
+ if (pmd_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (pmd_is_migration_entry(pmd)) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pte_none(pte))
+ return categories;
+
+ /*
+ * According to pagemap_hugetlb_range(), file-backed HugeTLB
+ * page cannot be swapped. So PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ const unsigned long psize = huge_page_size(hstate_vma(vma));
+ softleaf_t entry;
+
+ if (huge_pte_none(ptent)) {
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+ return;
+ }
+
+ entry = softleaf_from_pte(ptent);
+ if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
+ return;
+
+ if (softleaf_is_migration(entry))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (!p->vec_buf)
+ return;
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+ bool wp_allowed = userfaultfd_wp_async(vma) &&
+ userfaultfd_wp_use_markers(vma);
+
+ if (!wp_allowed) {
+ /* User requested explicit failure over wp-async capability */
+ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+ /*
+ * User requires wr-protect, and allows silently skipping
+ * unsupported vmas.
+ */
+ if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ return 1;
+ /*
+ * Then the request doesn't involve wr-protects at all,
+ * fall through to the rest checks, and allow vma walk.
+ */
+ }
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (wp_allowed)
+ vma_category |= PAGE_IS_WPALLOWED;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma_category |= PAGE_IS_SOFT_DIRTY;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+ * When there is no output buffer provided at all, the sentinel values
+ * won't match here. There is no other way for `cur_buf->end` to be
+ * non-zero other than it being non-empty.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ arch_enter_lazy_mmu_mode();
+
+ if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+ pte_t ptent = ptep_get(pte);
+
+ if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+ pte_swp_uffd_wp_any(ptent))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(pte);
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptent);
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte, ptent);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(walk->mm, start, ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ size_mul(arg->vec_len, sizeof(struct page_region))))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
@@ -1794,28 +3124,34 @@ struct numa_maps_private {
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
- int count = page_mapcount(page);
+ struct folio *folio = page_folio(page);
+ int count;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ count = folio_precise_page_mapcount(folio, page);
+ else
+ count = folio_average_page_mapcount(folio);
md->pages += nr_pages;
- if (pte_dirty || PageDirty(page))
+ if (pte_dirty || folio_test_dirty(folio))
md->dirty += nr_pages;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
md->active += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
md->writeback += nr_pages;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)] += nr_pages;
+ md->node[folio_nid(folio)] += nr_pages;
}
static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
@@ -1910,17 +3246,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
@@ -1935,6 +3276,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
.hugetlb_entry = gather_hugetlb_stats,
.pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1948,8 +3290,9 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1958,7 +3301,7 @@ static int show_numa_map(struct seq_file *m, void *v)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
@@ -1970,10 +3313,10 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_path(m, file_user_path(file), "\n\t= ");
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 2c8b62265981..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -170,8 +157,8 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
- } else if (mm && is_stack(vma)) {
+ seq_path(m, file_user_path(file), "");
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
}
@@ -188,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p)
return nommu_vma_show(m, _p);
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
- unsigned long addr = *pos;
- /* See m_next(). Zero at the start or after lseek. */
- if (addr == -1UL)
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
return NULL;
/* pin the task and mm whilst we play with them */
@@ -204,45 +204,42 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ mm = priv->lock_ctx.mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
- /* start the next element from addr */
- vma = find_vma(mm, addr);
- if (vma)
- return vma;
+ vma_iter_init(&priv->iter, mm, last_addr);
- mmap_read_unlock(mm);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->lock_ctx.mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct vm_area_struct *vma = _p;
-
- *pos = vma->vm_end;
- return find_vma(vma->vm_mm, vma->vm_end);
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
@@ -262,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
return -ENOMEM;
priv->inode = inode;
- priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+ int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
seq_release_private(inode, file);
return err;
@@ -279,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct proc_maps_private *priv = seq->private;
- if (priv->mm)
- mmdrop(priv->mm);
+ if (priv->lock_ctx.mm)
+ mmdrop(priv->lock_ctx.mm);
return seq_release_private(inode, file);
}
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..d6113dbe58e0 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = {
.get_link = proc_thread_self_get_link,
};
-static unsigned thread_self_inum __ro_after_init;
+unsigned thread_self_inum __ro_after_init;
int proc_setup_thread_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *thread_self;
int ret = -ENOMEM;
@@ -46,24 +45,20 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
- d_add(thread_self, inode);
+ d_make_persistent(thread_self, inode);
ret = 0;
- } else {
- dput(thread_self);
}
+ dput(thread_self);
}
inode_unlock(root_inode);
if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
- else
- fs_info->proc_thread_self = thread_self;
-
return ret;
}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 1fb213f379a5..f188bd900eb2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -8,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) "vmcore: " fmt
+
#include <linux/mm.h>
#include <linux/kcore.h>
#include <linux/user.h>
@@ -51,9 +53,14 @@ static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore;
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+struct vmcoredd_node {
+ struct list_head list; /* List of dumps */
+ void *buf; /* Buffer containing device's dump */
+ unsigned int size; /* Size of the buffer */
+};
+
/* Device Dump list and mutex to synchronize access to list */
static LIST_HEAD(vmcoredd_list);
-static DEFINE_MUTEX(vmcoredd_mutex);
static bool vmcoredd_disabled;
core_param(novmcoredd, vmcoredd_disabled, bool, 0);
@@ -62,17 +69,22 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-static DEFINE_SPINLOCK(vmcore_cb_lock);
+static DEFINE_MUTEX(vmcore_mutex);
+
DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
/* Whether the vmcore has been opened once. */
static bool vmcore_opened;
+/* Whether the vmcore is currently open. */
+static unsigned int vmcore_open;
+
+static void vmcore_process_device_ram(struct vmcore_cb *cb);
void register_vmcore_cb(struct vmcore_cb *cb)
{
INIT_LIST_HEAD(&cb->next);
- spin_lock(&vmcore_cb_lock);
+ mutex_lock(&vmcore_mutex);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
@@ -80,13 +92,15 @@ void register_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
- spin_unlock(&vmcore_cb_lock);
+ if (!vmcore_open && cb->get_device_ram)
+ vmcore_process_device_ram(cb);
+ mutex_unlock(&vmcore_mutex);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);
void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- spin_lock(&vmcore_cb_lock);
+ mutex_lock(&vmcore_mutex);
list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
@@ -95,7 +109,7 @@ void unregister_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
- spin_unlock(&vmcore_cb_lock);
+ mutex_unlock(&vmcore_mutex);
synchronize_srcu(&vmcore_cb_srcu);
}
@@ -120,9 +134,23 @@ static bool pfn_is_ram(unsigned long pfn)
static int open_vmcore(struct inode *inode, struct file *file)
{
- spin_lock(&vmcore_cb_lock);
+ mutex_lock(&vmcore_mutex);
vmcore_opened = true;
- spin_unlock(&vmcore_cb_lock);
+ if (vmcore_open + 1 == 0) {
+ mutex_unlock(&vmcore_mutex);
+ return -EBUSY;
+ }
+ vmcore_open++;
+ mutex_unlock(&vmcore_mutex);
+
+ return 0;
+}
+
+static int release_vmcore(struct inode *inode, struct file *file)
+{
+ mutex_lock(&vmcore_mutex);
+ vmcore_open--;
+ mutex_unlock(&vmcore_mutex);
return 0;
}
@@ -243,33 +271,27 @@ static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
{
struct vmcoredd_node *dump;
u64 offset = 0;
- int ret = 0;
size_t tsz;
char *buf;
- mutex_lock(&vmcoredd_mutex);
list_for_each_entry(dump, &vmcoredd_list, list) {
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (copy_to_iter(buf, tsz, iter) < tsz) {
- ret = -EFAULT;
- goto out_unlock;
- }
+ if (copy_to_iter(buf, tsz, iter) < tsz)
+ return -EFAULT;
size -= tsz;
start += tsz;
/* Leave now if buffer filled already */
if (!size)
- goto out_unlock;
+ return 0;
}
offset += dump->size;
}
-out_unlock:
- mutex_unlock(&vmcoredd_mutex);
- return ret;
+ return 0;
}
#ifdef CONFIG_MMU
@@ -278,20 +300,16 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
{
struct vmcoredd_node *dump;
u64 offset = 0;
- int ret = 0;
size_t tsz;
char *buf;
- mutex_lock(&vmcoredd_mutex);
list_for_each_entry(dump, &vmcoredd_list, list) {
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
if (remap_vmalloc_range_partial(vma, dst, buf, 0,
- tsz)) {
- ret = -EFAULT;
- goto out_unlock;
- }
+ tsz))
+ return -EFAULT;
size -= tsz;
start += tsz;
@@ -299,14 +317,12 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
/* Leave now if buffer filled already */
if (!size)
- goto out_unlock;
+ return 0;
}
offset += dump->size;
}
-out_unlock:
- mutex_unlock(&vmcoredd_mutex);
- return ret;
+ return 0;
}
#endif /* CONFIG_MMU */
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
@@ -316,10 +332,10 @@ out_unlock:
*/
static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
+ struct vmcore_range *m = NULL;
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
- struct vmcore *m = NULL;
if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
@@ -383,6 +399,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
/* leave now if filled buffer already */
if (!iov_iter_count(iter))
return acc;
+
+ cond_resched();
}
list_for_each_entry(m, &vmcore_list, list) {
@@ -402,6 +420,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
if (!iov_iter_count(iter))
return acc;
}
+
+ cond_resched();
}
return acc;
@@ -412,6 +432,34 @@ static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
return __read_vmcore(iter, &iocb->ki_pos);
}
+/**
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *vmcore_alloc_buf(size_t size)
+{
+#ifdef CONFIG_MMU
+ return vmalloc_user(size);
+#else
+ return vzalloc(size);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+
/*
* The vmcore fault handler uses the page cache and fills data using the
* standard __read_vmcore() function.
@@ -459,33 +507,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
-/**
- * vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @size: size of buffer
- *
- * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
- * the buffer to user-space by means of remap_vmalloc_range().
- *
- * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
- * disabled and there's no need to allow users to mmap the buffer.
- */
-static inline char *vmcore_alloc_buf(size_t size)
-{
-#ifdef CONFIG_MMU
- return vmalloc_user(size);
-#else
- return vzalloc(size);
-#endif
-}
-
-/*
- * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
- * essential for mmap_vmcore() in order to map physically
- * non-contiguous objects (ELF header, ELF note segment and memory
- * regions in the 1st kernel pointed to by PT_LOAD entries) into
- * virtually contiguous user-space in ELF layout.
- */
-#ifdef CONFIG_MMU
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.
@@ -571,7 +592,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
u64 start, end, len, tsz;
- struct vmcore *m;
+ struct vmcore_range *m;
start = (u64)vma->vm_pgoff << PAGE_SHIFT;
end = start + size;
@@ -688,21 +709,17 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
static const struct proc_ops vmcore_proc_ops = {
.proc_open = open_vmcore,
+ .proc_release = release_vmcore,
.proc_read_iter = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
};
-static struct vmcore* __init get_new_element(void)
-{
- return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
-}
-
static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
struct list_head *vc_list)
{
+ struct vmcore_range *m;
u64 size;
- struct vmcore *m;
size = elfsz + elfnotesegsz;
list_for_each_entry(m, vc_list, list) {
@@ -1104,7 +1121,6 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
Elf64_Ehdr *ehdr_ptr;
Elf64_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
@@ -1123,13 +1139,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -1147,7 +1158,6 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
Elf32_Ehdr *ehdr_ptr;
Elf32_Phdr *phdr_ptr;
loff_t vmcore_off;
- struct vmcore *new;
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
@@ -1166,13 +1176,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
size = end - start;
- /* Add this contiguous chunk of memory to vmcore list.*/
- new = get_new_element();
- if (!new)
+ if (vmcore_alloc_add_range(vc_list, start, size))
return -ENOMEM;
- new->paddr = start;
- new->size = size;
- list_add_tail(&new->list, vc_list);
/* Update the program header offset */
phdr_ptr->p_offset = vmcore_off + (paddr - start);
@@ -1185,8 +1190,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
struct list_head *vc_list)
{
+ struct vmcore_range *m;
loff_t vmcore_off;
- struct vmcore *m;
/* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
@@ -1370,9 +1375,8 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
vdd_hdr->n_type = NT_VMCOREDD;
- strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
- sizeof(vdd_hdr->name));
- memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+ strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME);
+ strscpy_pad(vdd_hdr->dump_name, data->dump_name);
}
/**
@@ -1486,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return -EINVAL;
dump = vzalloc(sizeof(*dump));
- if (!dump) {
- ret = -ENOMEM;
- goto out_err;
- }
+ if (!dump)
+ return -ENOMEM;
/* Keep size of the buffer page aligned so that it can be mmaped */
data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
@@ -1514,12 +1516,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
dump->buf = buf;
dump->size = data_size;
- /* Add the dump to driver sysfs list */
- mutex_lock(&vmcoredd_mutex);
- list_add_tail(&dump->list, &vmcoredd_list);
- mutex_unlock(&vmcoredd_mutex);
+ /* Add the dump to driver sysfs list and update the elfcore hdr */
+ scoped_guard(mutex, &vmcore_mutex) {
+ if (vmcore_opened)
+ pr_warn_once("Unexpected adding of device dump\n");
+ if (vmcore_open) {
+ ret = -EBUSY;
+ goto out_err;
+ }
- vmcoredd_update_size(data_size);
+ list_add_tail(&dump->list, &vmcoredd_list);
+ vmcoredd_update_size(data_size);
+ }
return 0;
out_err:
@@ -1531,11 +1539,163 @@ out_err:
EXPORT_SYMBOL(vmcore_add_device_dump);
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
+static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size)
+{
+ char *elfcorebuf_new;
+
+ if (WARN_ON_ONCE(new_size < elfcorebuf_sz))
+ return -EINVAL;
+ if (get_order(elfcorebuf_sz_orig) == get_order(new_size)) {
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+ }
+
+ elfcorebuf_new = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(new_size));
+ if (!elfcorebuf_new)
+ return -ENOMEM;
+ memcpy(elfcorebuf_new, elfcorebuf, elfcorebuf_sz);
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = elfcorebuf_new;
+ elfcorebuf_sz_orig = new_size;
+ return 0;
+}
+
+static void vmcore_reset_offsets_elf64(void)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz;
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ Elf64_Phdr *phdr;
+ int i;
+
+ for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) {
+ u64 start, end;
+
+ /*
+ * After merge_note_headers_elf64() we should only have a single
+ * PT_NOTE entry that starts immediately after elfcorebuf_sz.
+ */
+ if (phdr->p_type == PT_NOTE) {
+ phdr->p_offset = elfcorebuf_sz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE);
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off = vmcore_off + end - start;
+ }
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+}
+
+static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count)
+{
+ Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
+ struct vmcore_range *cur;
+ Elf64_Phdr *phdr;
+ size_t new_size;
+ int rc;
+
+ if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) {
+ pr_err("too many device ram ranges\n");
+ return -ENOSPC;
+ }
+
+ /* elfcorebuf_sz must always cover full pages. */
+ new_size = sizeof(Elf64_Ehdr) +
+ (ehdr->e_phnum + count) * sizeof(Elf64_Phdr);
+ new_size = roundup(new_size, PAGE_SIZE);
+
+ /*
+ * Make sure we have sufficient space to include the new PT_LOAD
+ * entries.
+ */
+ rc = vmcore_realloc_elfcore_buffer_elf64(new_size);
+ if (rc) {
+ pr_err("resizing elfcore failed\n");
+ return rc;
+ }
+
+ /* Modify our used elfcore buffer size to cover the new entries. */
+ elfcorebuf_sz = new_size;
+
+ /* Fill the added PT_LOAD entries. */
+ phdr = phdr_start + ehdr->e_phnum;
+ list_for_each_entry(cur, list, list) {
+ WARN_ON_ONCE(!IS_ALIGNED(cur->paddr | cur->size, PAGE_SIZE));
+ elfcorehdr_fill_device_ram_ptload_elf64(phdr, cur->paddr, cur->size);
+
+ /* p_offset will be adjusted later. */
+ phdr++;
+ ehdr->e_phnum++;
+ }
+ list_splice_tail(list, &vmcore_list);
+
+ /* We changed elfcorebuf_sz and added new entries; reset all offsets. */
+ vmcore_reset_offsets_elf64();
+
+ /* Finally, recalculate the total vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+ return 0;
+}
+
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+ unsigned char *e_ident = (unsigned char *)elfcorebuf;
+ struct vmcore_range *first, *m;
+ LIST_HEAD(list);
+ int count;
+
+ /* We only support Elf64 dumps for now. */
+ if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) {
+ pr_err("device ram ranges only support Elf64\n");
+ return;
+ }
+
+ if (cb->get_device_ram(cb, &list)) {
+ pr_err("obtaining device ram ranges failed\n");
+ return;
+ }
+ count = list_count_nodes(&list);
+ if (!count)
+ return;
+
+ /*
+ * For some reason these ranges are already know? Might happen
+ * with unusual register->unregister->register sequences; we'll simply
+ * sanity check using the first range.
+ */
+ first = list_first_entry(&list, struct vmcore_range, list);
+ list_for_each_entry(m, &vmcore_list, list) {
+ unsigned long long m_end = m->paddr + m->size;
+ unsigned long long first_end = first->paddr + first->size;
+
+ if (first->paddr < m_end && m->paddr < first_end)
+ goto out_free;
+ }
+
+ /* If adding the mem nodes succeeds, they must not be freed. */
+ if (!vmcore_add_device_ram_elf64(&list, count))
+ return;
+out_free:
+ vmcore_free_ranges(&list);
+}
+#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */
+static void vmcore_process_device_ram(struct vmcore_cb *cb)
+{
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
+
/* Free all dumps in vmcore device dump list */
static void vmcore_free_device_dumps(void)
{
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
- mutex_lock(&vmcoredd_mutex);
+ mutex_lock(&vmcore_mutex);
while (!list_empty(&vmcoredd_list)) {
struct vmcoredd_node *dump;
@@ -1545,7 +1705,7 @@ static void vmcore_free_device_dumps(void)
vfree(dump->buf);
vfree(dump);
}
- mutex_unlock(&vmcoredd_mutex);
+ mutex_unlock(&vmcore_mutex);
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
}
@@ -1567,7 +1727,7 @@ static int __init vmcore_init(void)
rc = parse_crash_elf_headers();
if (rc) {
elfcorehdr_free(elfcorehdr_addr);
- pr_warn("Kdump: vmcore not initialized\n");
+ pr_warn("not initialized\n");
return rc;
}
elfcorehdr_free(elfcorehdr_addr);
@@ -1588,14 +1748,7 @@ void vmcore_cleanup(void)
proc_vmcore = NULL;
}
- /* clear the vmcore list. */
- while (!list_empty(&vmcore_list)) {
- struct vmcore *m;
-
- m = list_first_entry(&vmcore_list, struct vmcore, list);
- list_del(&m->list);
- kfree(m);
- }
+ vmcore_free_ranges(&vmcore_list);
free_elfcorebuf();
/* clear vmcore device dump list */