-rw-r--r--Documentation/trace/fprobe.rst16
-rw-r--r--Documentation/trace/ftrace.rst6
-rw-r--r--Documentation/trace/user_events.rst167
-rw-r--r--fs/exec.c2
-rw-r--r--include/linux/fprobe.h10
-rw-r--r--include/linux/ftrace.h5
-rw-r--r--include/linux/sched.h5
-rw-r--r--include/linux/seq_buf.h2
-rw-r--r--include/linux/user_events.h101
-rw-r--r--include/uapi/linux/user_events.h81
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/bpf_trace.c17
-rw-r--r--kernel/trace/fprobe.c32
-rw-r--r--kernel/trace/ftrace.c51
-rw-r--r--kernel/trace/ring_buffer.c102
-rw-r--r--kernel/trace/trace.c7
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_user.c1034
-rw-r--r--kernel/trace/trace_output.c175
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--lib/seq_buf.c32
-rw-r--r--lib/test_fprobe.c105
-rw-r--r--samples/fprobe/fprobe_example.c7
-rw-r--r--samples/user_events/example.c45
-rwxr-xr-xscripts/leaking_addresses.pl1
-rw-r--r--scripts/recordmcount.c6
-rwxr-xr-xtools/kvm/kvm_stat/kvm_stat2
-rw-r--r--tools/testing/selftests/mm/protection_keys.c4
-rw-r--r--tools/testing/selftests/user_events/Makefile2
-rw-r--r--tools/testing/selftests/user_events/abi_test.c229
-rw-r--r--tools/testing/selftests/user_events/dyn_test.c2
-rw-r--r--tools/testing/selftests/user_events/ftrace_test.c176
-rw-r--r--tools/testing/selftests/user_events/perf_test.c39
35 files changed, 1959 insertions, 518 deletions
diff --git a/Documentation/trace/fprobe.rst b/Documentation/trace/fprobe.rst
index b64bec1ce144..40dd2fbce861 100644
--- a/Documentation/trace/fprobe.rst
+++ b/Documentation/trace/fprobe.rst
@@ -87,14 +87,16 @@ returns as same as unregister_ftrace_function().
The fprobe entry/exit handler
=============================
-The prototype of the entry/exit callback function is as follows:
+The prototypes of the entry/exit callback functions are as follows:
.. code-block:: c
- void callback_func(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
+ int entry_callback(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs, void *entry_data);
-Note that both entry and exit callbacks have same ptototype. The @entry_ip is
-saved at function entry and passed to exit handler.
+ void exit_callback(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs, void *entry_data);
+
+Note that the @entry_ip is saved at function entry and passed to exit handler.
+If the entry callback function returns !0, the corresponding exit callback will be cancelled.
@fp
This is the address of `fprobe` data structure related to this handler.
@@ -113,6 +115,12 @@ saved at function entry and passed to exit handler.
to use @entry_ip. On the other hand, in the exit_handler, the instruction
pointer of @regs is set to the currect return address.
+@entry_data
+ This is local storage for sharing data between the entry and exit handlers.
+ This storage is NULL by default. If the user specifies both the `exit_handler`
+ field and the `entry_data_size` field when registering the fprobe, the storage
+ is allocated and passed to both `entry_handler` and `exit_handler`.
+
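For illustration only (not part of this patch), a minimal sketch of an fprobe
using the entry/exit callbacks and @entry_data described above might look like
the following. It assumes the existing register_fprobe()/unregister_fprobe()
interface; the handler and structure names (``my_entry``, ``my_data``, and the
``vfs_read`` filter) are made up for the example.

.. code-block:: c

	/* Sketch: time a traced function using @entry_data shared storage. */
	#include <linux/fprobe.h>
	#include <linux/ktime.h>
	#include <linux/module.h>

	struct my_data {
		ktime_t entry_stamp;	/* filled at entry, read at exit */
	};

	static int my_entry(struct fprobe *fp, unsigned long entry_ip,
			    struct pt_regs *regs, void *entry_data)
	{
		struct my_data *d = entry_data;

		d->entry_stamp = ktime_get();
		return 0;	/* returning !0 would cancel the exit callback */
	}

	static void my_exit(struct fprobe *fp, unsigned long entry_ip,
			    struct pt_regs *regs, void *entry_data)
	{
		struct my_data *d = entry_data;

		pr_info("fprobe: %lld ns\n",
			ktime_to_ns(ktime_sub(ktime_get(), d->entry_stamp)));
	}

	static struct fprobe my_fprobe = {
		.entry_handler   = my_entry,
		.exit_handler    = my_exit,
		/* non-zero size makes @entry_data non-NULL in both handlers */
		.entry_data_size = sizeof(struct my_data),
	};

	/* register_fprobe(&my_fprobe, "vfs_read", NULL) at module init,
	 * unregister_fprobe(&my_fprobe) at module exit. */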
Share the callbacks with kprobes
================================
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index e8bca5fea7cc..a9c8bce4bc7b 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -1027,6 +1027,7 @@ To see what is available, simply cat the file::
nohex
nobin
noblock
+ nofields
trace_printk
annotate
nouserstacktrace
@@ -1110,6 +1111,11 @@ Here are the available options:
block
When set, reading trace_pipe will not block when polled.
+ fields
+ Print the fields as described by their types. This is a better
+ option than using hex, bin or raw, as it produces a more faithful
+ parsing of the event's content.
+
trace_printk
Can disable trace_printk() from writing into the buffer.
diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst
index 422802ef4025..f79987e16cf4 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -20,11 +20,10 @@ dynamic_events is the same as the ioctl with the u: prefix applied.
Typically programs will register a set of events that they wish to expose to
tools that can read trace_events (such as ftrace and perf). The registration
-process gives back two ints to the program for each event. The first int is
-the status bit. This describes which bit in little-endian format in the
-/sys/kernel/tracing/user_events_status file represents this event. The
-second int is the write index which describes the data when a write() or
-writev() is called on the /sys/kernel/tracing/user_events_data file.
+process tells the kernel which address and bit to reflect if any tool has
+enabled the event and data should be written. The registration will give back
+a write index which describes the data when a write() or writev() is called
+on the /sys/kernel/tracing/user_events_data file.
The structures referenced in this document are contained within the
/include/uapi/linux/user_events.h file in the source tree.
@@ -41,23 +40,64 @@ DIAG_IOCSREG.
This command takes a packed struct user_reg as an argument::
struct user_reg {
- u32 size;
- u64 name_args;
- u32 status_bit;
- u32 write_index;
- };
+ /* Input: Size of the user_reg structure being used */
+ __u32 size;
+
+ /* Input: Bit in enable address to use */
+ __u8 enable_bit;
+
+ /* Input: Enable size in bytes at address */
+ __u8 enable_size;
+
+ /* Input: Flags for future use, set to 0 */
+ __u16 flags;
+
+ /* Input: Address to update when enabled */
+ __u64 enable_addr;
+
+ /* Input: Pointer to string with event name, description and flags */
+ __u64 name_args;
+
+ /* Output: Index of the event to use when writing data */
+ __u32 write_index;
+ } __attribute__((__packed__));
+
+The struct user_reg requires all the above inputs to be set appropriately.
+
++ size: This must be set to sizeof(struct user_reg).
-The struct user_reg requires two inputs, the first is the size of the structure
-to ensure forward and backward compatibility. The second is the command string
-to issue for registering. Upon success two outputs are set, the status bit
-and the write index.
++ enable_bit: The bit to reflect the event status at the address specified by
+ enable_addr.
+
++ enable_size: The size of the value specified by enable_addr.
+ This must be 4 (32-bit) or 8 (64-bit). 64-bit values may only be used on
+ 64-bit kernels; 32-bit values can be used on all kernels.
+
++ flags: The flags to use, if any. For the initial version this must be 0.
+ Callers should first attempt to use flags and retry without flags to ensure
+ support for older kernels. If a flag is not supported, -EINVAL
+ is returned.
+
++ enable_addr: The address of the value to use to reflect event status. This
+ must be naturally aligned and write accessible within the user program.
+
++ name_args: The name and arguments to describe the event, see command format
+ for details.
+
+Upon successful registration, the following output is set.
+
++ write_index: The index to use for this file descriptor that represents this
+ event when writing out data. The index is unique to this instance of the file
+ descriptor that was used for the registration. See writing data for details.
User based events show up under tracefs like any other event under the
subsystem named "user_events". This means tools that wish to attach to the
events need to use /sys/kernel/tracing/events/user_events/[name]/enable
or perf record -e user_events:[name] when attaching/recording.
-**NOTE:** *The write_index returned is only valid for the FD that was used*
+**NOTE:** The event subsystem name by default is "user_events". Callers should
+not assume it will always be "user_events". Operators reserve the right in the
+future to change the subsystem name per-process to accommodate event isolation.
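As a rough illustration (not part of this patch), a user program registering a
32-bit value and bit 0 for an event could look like the sketch below. The event
name ("test u32 count") and the ``enabled`` variable are made up for the
example; error handling is mostly omitted::

	/* Sketch only: register &enabled/bit 0 for an event named "test". */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/user_events.h>

	static int enabled;	/* kernel sets/clears bit 0 as tools attach/detach */

	int main(void)
	{
		struct user_reg reg = {0};
		int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

		if (fd < 0)
			return 1;

		reg.size = sizeof(reg);
		reg.enable_bit = 0;
		reg.enable_size = sizeof(enabled);	/* 4 == 32-bit value */
		reg.enable_addr = (__u64)(uintptr_t)&enabled;
		reg.name_args = (__u64)(uintptr_t)"test u32 count";

		if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
			return 1;

		/* reg.write_index identifies this event for write()/writev() on fd */
		printf("write_index=%u\n", reg.write_index);

		return 0;
	}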
Command Format
^^^^^^^^^^^^^^
@@ -94,7 +134,7 @@ Would be represented by the following field::
struct mytype myname 20
Deleting
------------
+--------
Deleting an event from within a user process is done via ioctl() out to the
/sys/kernel/tracing/user_events_data file. The command to issue is
DIAG_IOCSDEL.
@@ -104,92 +144,79 @@ its name. Delete will only succeed if there are no references left to the
event (in both user and kernel space). User programs should use a separate file
to request deletes than the one used for registration due to this.
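For illustration only (not part of this patch), a delete issued from its own
descriptor might look like this sketch, reusing the hypothetical "test" event
and the includes from the registration sketch above plus <unistd.h>::

	/* Sketch: delete by name; fails while any references remain. */
	static int example_delete(const char *name)
	{
		int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
		int ret;

		if (fd < 0)
			return -1;

		ret = ioctl(fd, DIAG_IOCSDEL, name);
		close(fd);

		return ret;
	}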
-Status
-------
-When tools attach/record user based events the status of the event is updated
-in realtime. This allows user programs to only incur the cost of the write() or
-writev() calls when something is actively attached to the event.
-
-User programs call mmap() on /sys/kernel/tracing/user_events_status to
-check the status for each event that is registered. The bit to check in the
-file is given back after the register ioctl() via user_reg.status_bit. The bit
-is always in little-endian format. Programs can check if the bit is set either
-using a byte-wise index with a mask or a long-wise index with a little-endian
-mask.
+Unregistering
+-------------
+If, after registering an event, updates to it are no longer wanted, it can
+be disabled via ioctl() out to the /sys/kernel/tracing/user_events_data file.
+The command to issue is DIAG_IOCSUNREG. This is different from deleting, which
+actually removes the event from the system. Unregistering simply tells the
+kernel that your process is no longer interested in updates to the event.
-Currently the size of user_events_status is a single page, however, custom
-kernel configurations can change this size to allow more user based events. In
-all cases the size of the file is a multiple of a page size.
+This command takes a packed struct user_unreg as an argument::
-For example, if the register ioctl() gives back a status_bit of 3 you would
-check byte 0 (3 / 8) of the returned mmap data and then AND the result with 8
-(1 << (3 % 8)) to see if anything is attached to that event.
+ struct user_unreg {
+ /* Input: Size of the user_unreg structure being used */
+ __u32 size;
-A byte-wise index check is performed as follows::
+ /* Input: Bit to unregister */
+ __u8 disable_bit;
- int index, mask;
- char *status_page;
+ /* Input: Reserved, set to 0 */
+ __u8 __reserved;
- index = status_bit / 8;
- mask = 1 << (status_bit % 8);
-
- ...
+ /* Input: Reserved, set to 0 */
+ __u16 __reserved2;
- if (status_page[index] & mask) {
- /* Enabled */
- }
+ /* Input: Address to unregister */
+ __u64 disable_addr;
+ } __attribute__((__packed__));
-A long-wise index check is performed as follows::
+The struct user_unreg requires all the above inputs to be set appropriately.
- #include <asm/bitsperlong.h>
- #include <endian.h>
++ size: This must be set to sizeof(struct user_unreg).
- #if __BITS_PER_LONG == 64
- #define endian_swap(x) htole64(x)
- #else
- #define endian_swap(x) htole32(x)
- #endif
++ disable_bit: This must be set to the bit to disable (same bit that was
+ previously registered via enable_bit).
- long index, mask, *status_page;
++ disable_addr: This must be set to the address to disable (same address that was
+ previously registered via enable_addr).
- index = status_bit / __BITS_PER_LONG;
- mask = 1L << (status_bit % __BITS_PER_LONG);
- mask = endian_swap(mask);
+**NOTE:** Events are automatically unregistered when execve() is invoked. During
+fork() the registered events will be retained and must be unregistered manually
+in each process if wanted.
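Continuing the registration sketch earlier in this document (illustrative only,
same ``fd`` and ``enabled`` variable), unregistering the address and bit could
look like::

	/* Sketch: stop the kernel from updating &enabled for this event. */
	struct user_unreg unreg = {0};

	unreg.size = sizeof(unreg);
	unreg.disable_bit = 0;				/* bit used at registration */
	unreg.disable_addr = (__u64)(uintptr_t)&enabled;	/* same address */

	ioctl(fd, DIAG_IOCSUNREG, &unreg);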
- ...
+Status
+------
+When tools attach/record user based events the status of the event is updated
+in realtime. This allows user programs to only incur the cost of the write() or
+writev() calls when something is actively attached to the event.
- if (status_page[index] & mask) {
- /* Enabled */
- }
+The kernel will update the specified bit that was registered for the event as
+tools attach/detach from the event. User programs simply check if the bit is set
+to see if something is attached or not.
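For example (sketch, reusing the 32-bit ``enabled`` value and bit 0 from the
registration sketch above), a program can skip the write path entirely when
nothing is attached::

	/* Sketch: cheap check before paying for write()/writev(). */
	if (enabled & (1 << 0)) {
		/* something is attached, emit the event data */
	}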
Administrators can easily check the status of all registered events by reading
the user_events_status file directly via a terminal. The output is as follows::
- Byte:Name [# Comments]
+ Name [# Comments]
...
Active: ActiveCount
Busy: BusyCount
- Max: MaxCount
For example, on a system that has a single event the output looks like this::
- 1:test
+ test
Active: 1
Busy: 0
- Max: 32768
If a user enables the user event via ftrace, the output would change to this::
- 1:test # Used by ftrace
+ test # Used by ftrace
Active: 1
Busy: 1
- Max: 32768
-
-**NOTE:** *A status bit of 0 will never be returned. This allows user programs
-to have a bit that can be used on error cases.*
Writing Data
------------
@@ -217,7 +244,7 @@ For example, if I have a struct like this::
int src;
int dst;
int flags;
- };
+ } __attribute__((__packed__));
It's advised for user programs to do the following::
diff --git a/fs/exec.c b/fs/exec.c
index 87cf3a2f0e9a..a466e797c8e2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
#include <linux/time_namespace.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm,
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
+ user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 1c2bde0ead73..47fefc7f363b 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -13,6 +13,8 @@
* @nmissed: The counter for missing events.
* @flags: The status flag.
* @rethook: The rethook data structure. (internal data)
+ * @entry_data_size: The private data storage size.
+ * @nr_maxactive: The max number of active functions.
* @entry_handler: The callback function for function entry.
* @exit_handler: The callback function for function exit.
*/
@@ -29,9 +31,13 @@ struct fprobe {
unsigned long nmissed;
unsigned int flags;
struct rethook *rethook;
+ size_t entry_data_size;
+ int nr_maxactive;
- void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
- void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs);
+ int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs, void *entry_data);
+ void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs, void *entry_data);
};
/* This fprobe is soft-disabled. */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3e56cb6f40d1..6954e4ed5bbf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -548,6 +548,7 @@ bool is_ftrace_trampoline(unsigned long addr);
* DIRECT - there is a direct function to call
* CALL_OPS - the record can use callsite-specific ops
* CALL_OPS_EN - the function is set up to use callsite-specific ops
+ * TOUCHED - A callback was added since boot up
*
* When a new ftrace_ops is registered and wants a function to save
* pt_regs, the rec->flags REGS is set. When the function has been
@@ -567,9 +568,10 @@ enum {
FTRACE_FL_DIRECT_EN = (1UL << 23),
FTRACE_FL_CALL_OPS = (1UL << 22),
FTRACE_FL_CALL_OPS_EN = (1UL << 21),
+ FTRACE_FL_TOUCHED = (1UL << 20),
};
-#define FTRACE_REF_MAX_SHIFT 21
+#define FTRACE_REF_MAX_SHIFT 20
#define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
#define ftrace_rec_count(rec) ((rec)->flags & FTRACE_REF_MAX)
@@ -628,6 +630,7 @@ enum {
FTRACE_ITER_PROBE = (1 << 4),
FTRACE_ITER_MOD = (1 << 5),
FTRACE_ITER_ENABLED = (1 << 6),
+ FTRACE_ITER_TOUCHED = (1 << 7),
};
void arch_ftrace_update_code(int command);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc4ad4c58fae..eed5d65b8d1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -70,6 +70,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct user_event_mm;
/*
* Task state bitmask. NOTE! These bits are also
@@ -1529,6 +1530,10 @@ struct task_struct {
union rv_task_monitor rv[RV_PER_TASK_MONITORS];
#endif
+#ifdef CONFIG_USER_EVENTS
+ struct user_event_mm *user_event_mm;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 5b31c5147969..515d7fcb9634 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -159,4 +159,6 @@ extern int
seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
#endif
+void seq_buf_do_printk(struct seq_buf *s, const char *lvl);
+
#endif /* _LINUX_SEQ_BUF_H */
diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 592a3fbed98e..2847f5a18a86 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -1,54 +1,83 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (c) 2021, Microsoft Corporation.
+ * Copyright (c) 2022, Microsoft Corporation.
*
* Authors:
* Beau Belgrave <beaub@linux.microsoft.com>
*/
-#ifndef _UAPI_LINUX_USER_EVENTS_H
-#define _UAPI_LINUX_USER_EVENTS_H
-#include <linux/types.h>
-#include <linux/ioctl.h>
+#ifndef _LINUX_USER_EVENTS_H
+#define _LINUX_USER_EVENTS_H
-#ifdef __KERNEL__
-#include <linux/uio.h>
-#else
-#include <sys/uio.h>
-#endif
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/mm_types.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/user_events.h>
-#define USER_EVENTS_SYSTEM "user_events"
-#define USER_EVENTS_PREFIX "u:"
+#ifdef CONFIG_USER_EVENTS
+struct user_event_mm {
+ struct list_head link;
+ struct list_head enablers;
+ struct mm_struct *mm;
+ struct user_event_mm *next;
+ refcount_t refcnt;
+ refcount_t tasks;
+ struct rcu_work put_rwork;
+};
-/* Create dynamic location entry within a 32-bit value */
-#define DYN_LOC(offset, size) ((size) << 16 | (offset))
+extern void user_event_mm_dup(struct task_struct *t,
+ struct user_event_mm *old_mm);
-/*
- * Describes an event registration and stores the results of the registration.
- * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
- * must set the size and name_args before invocation.
- */
-struct user_reg {
+extern void user_event_mm_remove(struct task_struct *t);
+
+static inline void user_events_fork(struct task_struct *t,
+ unsigned long clone_flags)
+{
+ struct user_event_mm *old_mm;
- /* Input: Size of the user_reg structure being used */
- __u32 size;
+ if (!t || !current->user_event_mm)
+ return;
- /* Input: Pointer to string with event name, description and flags */
- __u64 name_args;
+ old_mm = current->user_event_mm;
- /* Output: Bitwise index of the event within the status page */
- __u32 status_bit;
+ if (clone_flags & CLONE_VM) {
+ t->user_event_mm = old_mm;
+ refcount_inc(&old_mm->tasks);
+ return;
+ }
- /* Output: Index of the event to use when writing data */
- __u32 write_index;
-} __attribute__((__packed__));
+ user_event_mm_dup(t, old_mm);
+}
-#define DIAG_IOC_MAGIC '*'
+static inline void user_events_execve(struct task_struct *t)
+{
+ if (!t || !t->user_event_mm)
+ return;
+
+ user_event_mm_remove(t);
+}
+
+static inline void user_events_exit(struct task_struct *t)
+{
+ if (!t || !t->user_event_mm)
+ return;
+
+ user_event_mm_remove(t);
+}
+#else
+static inline void user_events_fork(struct task_struct *t,
+ unsigned long clone_flags)
+{
+}
-/* Requests to register a user_event */
-#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*)
+static inline void user_events_execve(struct task_struct *t)
+{
+}
-/* Requests to delete a user_event */
-#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*)
+static inline void user_events_exit(struct task_struct *t)
+{
+}
+#endif /* CONFIG_USER_EVENTS */
-#endif /* _UAPI_LINUX_USER_EVENTS_H */
+#endif /* _LINUX_USER_EVENTS_H */
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
new file mode 100644
index 000000000000..2984aae4a2b4
--- /dev/null
+++ b/include/uapi/linux/user_events.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2021-2022, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+#ifndef _UAPI_LINUX_USER_EVENTS_H
+#define _UAPI_LINUX_USER_EVENTS_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define USER_EVENTS_SYSTEM "user_events"
+#define USER_EVENTS_PREFIX "u:"
+
+/* Create dynamic location entry within a 32-bit value */
+#define DYN_LOC(offset, size) ((size) << 16 | (offset))
+
+/*
+ * Describes an event registration and stores the results of the registration.
+ * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
+ * must set the size and name_args before invocation.
+ */
+struct user_reg {
+
+ /* Input: Size of the user_reg structure being used */
+ __u32 size;
+
+ /* Input: Bit in enable address to use */
+ __u8 enable_bit;
+
+ /* Input: Enable size in bytes at address */
+ __u8 enable_size;
+
+ /* Input: Flags for future use, set to 0 */
+ __u16 flags;
+
+ /* Input: Address to update when enabled */
+ __u64 enable_addr;
+
+ /* Input: Pointer to string with event name, description and flags */
+ __u64 name_args;
+
+ /* Output: Index of the event to use when writing data */
+ __u32 write_index;
+} __attribute__((__packed__));
+
+/*
+ * Describes an event unregister, callers must set the size, address and bit.
+ * This structure is passed to the DIAG_IOCSUNREG ioctl to disable bit updates.
+ */
+struct user_unreg {
+ /* Input: Size of the user_unreg structure being used */
+ __u32 size;
+
+ /* Input: Bit to unregister */
+ __u8 disable_bit;
+
+ /* Input: Reserved, set to 0 */
+ __u8 __reserved;
+
+ /* Input: Reserved, set to 0 */
+ __u16 __reserved2;
+
+ /* Input: Address to unregister */
+ __u64 disable_addr;
+} __attribute__((__packed__));
+
+#define DIAG_IOC_MAGIC '*'
+
+/* Request to register a user_event */
+#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg *)
+
+/* Request to delete a user_event */
+#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char *)
+
+/* Requests to unregister a user_event */
+#define DIAG_IOCSUNREG _IOW(DIAG_IOC_MAGIC, 2, struct user_unreg*)
+
+#endif /* _UAPI_LINUX_USER_EVENTS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 86902cb5ab78..34b90e2e7cf7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -818,6 +819,7 @@ void __noreturn do_exit(long code)
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index eccb35a85216..735d9f4f5acf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
+#include <linux/user_events.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2735,6 +2736,7 @@ __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5b1e7fa41ca8..8cf97fa4a4b3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -792,15 +792,15 @@ config USER_EVENTS
bool "User trace events"
select TRACING
select DYNAMIC_EVENTS
- depends on BROKEN || COMPILE_TEST # API needs to be straighten out
help
User trace events are user-defined trace events that
can be used like an existing kernel trace event. User trace
events are generated by writing to a tracefs file. User
processes can determine if their tracing events should be
- generated by memory mapping a tracefs file and checking for
- an associated byte being non-zero.
+ generated by registering a value and bit with the kernel
+ that reflects when it is enabled or not.
+ See Documentation/trace/user_events.rst.
If in doubt, say N.
config HIST_TRIGGERS
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index bcf91bc7bf71..9a050e36dc6c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2640,9 +2640,20 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
return err;
}
-static void
+static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs, void *data)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ return 0;
+}
+
+static void
+kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+ struct pt_regs *regs, void *data)
{
struct bpf_kprobe_multi_link *link;
@@ -2844,7 +2855,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_handler;
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
else
link->fp.entry_handler = kprobe_multi_link_handler;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index e8143e368074..9abb3905bc8e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -17,15 +17,17 @@
struct fprobe_rethook_node {
struct rethook_node node;
unsigned long entry_ip;
+ char data[];
};
static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct fprobe_rethook_node *fpr;
- struct rethook_node *rh;
+ struct rethook_node *rh = NULL;
struct fprobe *fp;
- int bit;
+ void *entry_data = NULL;
+ int bit, ret;
fp = container_of(ops, struct fprobe, ops);
if (fprobe_disabled(fp))
@@ -37,9 +39,6 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
return;
}
- if (fp->entry_handler)
- fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
-
if (fp->exit_handler) {
rh = rethook_try_get(fp->rethook);
if (!rh) {
@@ -48,9 +47,20 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
}
fpr = container_of(rh, struct fprobe_rethook_node, node);
fpr->entry_ip = ip;
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ if (fp->entry_data_size)
+ entry_data = fpr->data;
}
+ if (fp->entry_handler)
+ ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
+
+ /* If entry_handler returns !0, nmissed is not counted. */
+ if (rh) {
+ if (ret)
+ rethook_recycle(rh);
+ else
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
out:
ftrace_test_recursion_unlock(bit);
}
@@ -81,7 +91,8 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
fpr = container_of(rh, struct fprobe_rethook_node, node);
- fp->exit_handler(fp, fpr->entry_ip, regs);
+ fp->exit_handler(fp, fpr->entry_ip, regs,
+ fp->entry_data_size ? (void *)fpr->data : NULL);
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
@@ -136,7 +147,10 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
}
/* Initialize rethook if needed */
- size = num * num_possible_cpus() * 2;
+ if (fp->nr_maxactive)
+ size = fp->nr_maxactive;
+ else
+ size = num * num_possible_cpus() * 2;
if (size < 0)
return -E2BIG;
@@ -146,7 +160,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
for (i = 0; i < size; i++) {
struct fprobe_rethook_node *node;
- node = kzalloc(sizeof(*node), GFP_KERNEL);
+ node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
if (!node) {
rethook_free(fp->rethook);
fp->rethook = NULL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 08155f652554..76973a70ab9d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,9 @@
#include "trace_output.h"
#include "trace_stat.h"
+/* Flags that do not get reset */
+#define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+
#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
#define FTRACE_WARN_ON(cond) \
@@ -2256,7 +2259,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
flag ^= rec->flags & FTRACE_FL_ENABLED;
if (update) {
- rec->flags |= FTRACE_FL_ENABLED;
+ rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
if (flag & FTRACE_FL_REGS) {
if (rec->flags & FTRACE_FL_REGS)
rec->flags |= FTRACE_FL_REGS_EN;
@@ -2326,7 +2329,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags &= FTRACE_FL_DISABLED;
+ rec->flags &= FTRACE_NOCLEAR_FLAGS;
else
/*
* Just disable the record, but keep the ops TRAMP
@@ -3147,7 +3150,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
@@ -3598,7 +3601,10 @@ t_func_next(struct seq_file *m, loff_t *pos)
!ftrace_lookup_ip(iter->hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
- !(rec->flags & FTRACE_FL_ENABLED))) {
+ !(rec->flags & FTRACE_FL_ENABLED)) ||
+
+ ((iter->flags & FTRACE_ITER_TOUCHED) &&
+ !(rec->flags & FTRACE_FL_TOUCHED))) {
rec = NULL;
goto retry;
@@ -3857,7 +3863,7 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
- if (iter->flags & FTRACE_ITER_ENABLED) {
+ if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s%s%s",
@@ -3959,6 +3965,31 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return 0;
}
+static int
+ftrace_touched_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+
+ /*
+ * This shows us what functions have ever been enabled
+ * (traced, direct, patched, etc). Not sure if we want lockdown
+ * to hide such critical information for an admin.
+ * Although, perhaps it can show information we don't
+ * want people to see, but if something had traced
+ * something, we probably want to know about it.
+ */
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_TOUCHED;
+ iter->ops = &global_ops;
+
+ return 0;
+}
+
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -5872,6 +5903,13 @@ static const struct file_operations ftrace_enabled_fops = {
.release = seq_release_private,
};
+static const struct file_operations ftrace_touched_fops = {
+ .open = ftrace_touched_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
@@ -6336,6 +6374,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
+ trace_create_file("touched_functions", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_touched_fops);
+
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 76a2d91eecad..834b361a4a66 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -163,7 +163,7 @@ enum {
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
-static inline int rb_null_event(struct ring_buffer_event *event)
+static inline bool rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
@@ -363,11 +363,9 @@ static void free_buffer_page(struct buffer_page *bpage)
/*
* We need to fit the time_stamp delta into 27 bits.
*/
-static inline int test_time_stamp(u64 delta)
+static inline bool test_time_stamp(u64 delta)
{
- if (delta & TS_DELTA_TEST)
- return 1;
- return 0;
+ return !!(delta & TS_DELTA_TEST);
}
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
@@ -696,7 +694,7 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
return ret == expect;
}
-static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
unsigned long cnt, top, bottom, msb;
unsigned long cnt2, top2, bottom2, msb2;
@@ -1486,7 +1484,7 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
return NULL;
}
-static int rb_head_page_replace(struct buffer_page *old,
+static bool rb_head_page_replace(struct buffer_page *old,
struct buffer_page *new)
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
@@ -1565,15 +1563,12 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
-static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
unsigned long val = (unsigned long)bpage;
- if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
- return 1;
-
- return 0;
+ RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
/**
@@ -1583,30 +1578,28 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
* As a safety measure we check to make sure the data pages have not
* been corrupted.
*/
-static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *head = rb_list_head(cpu_buffer->pages);
struct list_head *tmp;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->next)->prev) != head))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->prev)->next) != head))
- return -1;
+ return;
for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return -1;
+ return;
}
-
- return 0;
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1774,6 +1767,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ irq_work_sync(&cpu_buffer->irq_work.work);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -1880,6 +1875,8 @@ ring_buffer_free(struct trace_buffer *buffer)
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ irq_work_sync(&buffer->irq_work.work);
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
@@ -1918,7 +1915,7 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
return local_read(&bpage->write) & RB_WRITE_MASK;
}
-static int
+static bool
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
@@ -2031,12 +2028,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
return nr_removed == 0;
}
-static int
+static bool
rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
- int retries, success;
unsigned long flags;
+ bool success;
+ int retries;
/* Can be called at early boot up, where interrupts must not been enabled */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2055,15 +2053,16 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* spinning.
*/
retries = 10;
- success = 0;
+ success = false;
while (retries--) {
struct list_head *head_page, *prev_page, *r;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
+ struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
- head_page = &rb_set_head_page(cpu_buffer)->list;
- if (!head_page)
+ if (!hpage)
break;
+ head_page = &hpage->list;
prev_page = head_page->prev;
first_page = pages->next;
@@ -2084,7 +2083,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* pointer to point to end of list
*/
head_page->prev = last_page;
- success = 1;
+ success = true;
break;
}
}
@@ -2112,7 +2111,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- int success;
+ bool success;
if (cpu_buffer->nr_pages_to_update > 0)
success = rb_insert_pages(cpu_buffer);
@@ -2995,7 +2994,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event)
}
}
-static inline int
+static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -3016,7 +3015,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
delta = rb_time_delta(event);
if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return 0;
+ return false;
/* Make sure the write stamp is read before testing the location */
barrier();
@@ -3029,7 +3028,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
/* Something came in, can't discard */
if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
write_stamp, write_stamp - delta))
- return 0;
+ return false;
/*
* It's possible that the event time delta is zero
@@ -3062,12 +3061,12 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
if (index == old_index) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
- return 1;
+ return true;
}
}
/* could not discard */
- return 0;
+ return false;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3288,7 +3287,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* Note: The TRANSITION bit only handles a single transition between context.
*/
-static __always_inline int
+static __always_inline bool
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
@@ -3305,14 +3304,14 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
bit = RB_CTX_TRANSITION;
if (val & (1 << (bit + cpu_buffer->nest))) {
do_ring_buffer_record_recursion();
- return 1;
+ return true;
}
}
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
- return 0;
+ return false;
}
static __always_inline void
@@ -4069,10 +4068,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd | RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
@@ -4092,10 +4091,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd & ~RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
@@ -4502,7 +4501,6 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
default:
RB_WARN_ON(cpu_buffer, 1);
}
- return;
}
static void
@@ -4533,7 +4531,6 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
default:
RB_WARN_ON(iter->cpu_buffer, 1);
}
- return;
}
static struct buffer_page *
@@ -4543,7 +4540,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
- int ret;
+ bool ret;
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4953,7 +4950,6 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
{
if (likely(locked))
raw_spin_unlock(&cpu_buffer->reader_lock);
- return;
}
/**
@@ -5345,6 +5341,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
+/* Flag to ensure proper resetting of atomic variables */
+#define RESET_BIT (1 << 30)
+
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
@@ -5361,20 +5360,27 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
for_each_online_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
}
/* Make sure all commits have finished */
synchronize_rcu();
- for_each_online_buffer_cpu(buffer, cpu) {
+ for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
+ /*
+ * If a CPU came online during the synchronize_rcu(), then
+ * ignore it.
+ */
+ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+ continue;
+
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
mutex_unlock(&buffer->mutex);
@@ -5424,8 +5430,8 @@ bool ring_buffer_empty(struct trace_buffer *buffer)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
+ bool ret;
int cpu;
- int ret;
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
@@ -5454,7 +5460,7 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
- int ret;
+ bool ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return true;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 36a6037823cd..427da2341bf0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3726,7 +3726,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
#define STATIC_FMT_BUF_SIZE 128
static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
-static char *trace_iter_expand_format(struct trace_iterator *iter)
+char *trace_iter_expand_format(struct trace_iterator *iter)
{
char *tmp;
@@ -4446,8 +4446,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
- if (event)
+ if (event) {
+ if (tr->trace_flags & TRACE_ITER_FIELDS)
+ return print_event_fields(iter, event);
return event->funcs->trace(iter, sym_flags, event);
+ }
trace_seq_printf(s, "Unknown type %d\n", entry->type);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616e1aa1c4da..79bdefe9261b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -619,6 +619,7 @@ bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
va_list ap) __printf(2, 0);
+char *trace_iter_expand_format(struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -1199,6 +1200,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(HEX, "hex"), \
C(BIN, "bin"), \
C(BLOCK, "block"), \
+ C(FIELDS, "fields"), \
C(PRINTK, "trace_printk"), \
C(ANNOTATE, "annotate"), \
C(USERSTACKTRACE, "userstacktrace"), \
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 908e8a13c675..b1ecd7677642 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -19,14 +19,12 @@
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
-/* Reminder to move to uapi when everything works */
-#ifdef CONFIG_COMPILE_TEST
+#include <linux/highmem.h>
+#include <linux/init.h>
#include <linux/user_events.h>
-#else
-#include <uapi/linux/user_events.h>
-#endif
-#include "trace.h"
#include "trace_dynevent.h"
+#include "trace_output.h"
+#include "trace.h"
#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
@@ -34,35 +32,12 @@
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2
-/*
- * Limits how many trace_event calls user processes can create:
- * Must be a power of two of PAGE_SIZE.
- */
-#define MAX_PAGE_ORDER 0
-#define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
-#define MAX_EVENTS (MAX_BYTES * 8)
-
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
/*
- * The MAP_STATUS_* macros are used for taking a index and determining the
- * appropriate byte and the bit in the byte to set/reset for an event.
- *
- * The lower 3 bits of the index decide which bit to set.
- * The remaining upper bits of the index decide which byte to use for the bit.
- *
- * This is used when an event has a probe attached/removed to reflect live
- * status of the event wanting tracing or not to user-programs via shared
- * memory maps.
- */
-#define MAP_STATUS_BYTE(index) ((index) >> 3)
-#define MAP_STATUS_MASK(index) BIT((index) & 7)
-
-/*
* Internal bits (kernel side only) to keep track of connected probes:
* These are used when status is requested in text form about an event. These
* bits are compared against an internal byte on the event to determine which
@@ -75,25 +50,25 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
- * Stores the pages, tables, and locks for a group of events.
- * Each logical grouping of events has its own group, with a
- * matching page for status checks within user programs. This
- * allows for isolation of events to user programs by various
- * means.
+ * Stores the system name, tables, and locks for a group of events. This
+ * allows isolation for events by various means.
*/
struct user_event_group {
- struct page *pages;
- char *register_page_data;
- char *system_name;
- struct hlist_node node;
- struct mutex reg_mutex;
+ char *system_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
- DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
};
/* Group for init_user_ns mapping, top-most group */
static struct user_event_group *init_group;
+/* Max allowed events for the whole system */
+static unsigned int max_user_events = 32768;
+
+/* Current number of events on the whole system */
+static unsigned int current_user_events;
+
/*
* Stores per-event properties, as users register events
* within a file a user_event might be created if it does not
@@ -102,21 +77,61 @@ static struct user_event_group *init_group;
* refcnt reaches one.
*/
struct user_event {
- struct user_event_group *group;
- struct tracepoint tracepoint;
- struct trace_event_call call;
- struct trace_event_class class;
- struct dyn_event devent;
- struct hlist_node node;
- struct list_head fields;
- struct list_head validators;
- refcount_t refcnt;
- int index;
- int flags;
- int min_size;
- char status;
+ struct user_event_group *group;
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ refcount_t refcnt;
+ int min_size;
+ char status;
+};
+
+/*
+ * Stores per-mm/event properties that enable an address to be
+ * updated properly for each task. As tasks are forked, we use
+ * these to track enablement sites that are tied to an event.
+ */
+struct user_event_enabler {
+ struct list_head link;
+ struct user_event *event;
+ unsigned long addr;
+
+ /* Track enable bit, flags, etc. Aligned for bitops. */
+ unsigned int values;
+};
+
+/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
+#define ENABLE_VAL_BIT_MASK 0x3F
+
+/* Bit 6 is for faulting status of enablement */
+#define ENABLE_VAL_FAULTING_BIT 6
+
+/* Bit 7 is for freeing status of enablement */
+#define ENABLE_VAL_FREEING_BIT 7
+
+/* Only duplicate the bit value */
+#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+
+#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values)
+
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+ struct work_struct work;
+ struct user_event_mm *mm;
+ struct user_event_enabler *enabler;
+ int attempt;
};
+static struct kmem_cache *fault_cache;
+
+/* Global list of memory descriptors using user_events */
+static LIST_HEAD(user_event_mms);
+static DEFINE_SPINLOCK(user_event_mms_lock);
+
/*
* Stores per-file events references, as users register events
* within a file this structure is modified and freed via RCU.
@@ -124,23 +139,23 @@ struct user_event {
* These are not shared and only accessible by the file that created it.
*/
struct user_event_refs {
- struct rcu_head rcu;
- int count;
- struct user_event *events[];
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
};
struct user_event_file_info {
- struct user_event_group *group;
- struct user_event_refs *refs;
+ struct user_event_group *group;
+ struct user_event_refs *refs;
};
#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)
struct user_event_validator {
- struct list_head link;
- int offset;
- int flags;
+ struct list_head link;
+ int offset;
+ int flags;
};
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
@@ -150,33 +165,17 @@ static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser);
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
+static void user_event_mm_put(struct user_event_mm *mm);
+
static u32 user_event_key(char *name)
{
return jhash(name, strlen(name), 0);
}
-static void set_page_reservations(char *pages, bool set)
-{
- int page;
-
- for (page = 0; page < MAX_PAGES; ++page) {
- void *addr = pages + (PAGE_SIZE * page);
-
- if (set)
- SetPageReserved(virt_to_page(addr));
- else
- ClearPageReserved(virt_to_page(addr));
- }
-}
-
static void user_event_group_destroy(struct user_event_group *group)
{
- if (group->register_page_data)
- set_page_reservations(group->register_page_data, false);
-
- if (group->pages)
- __free_pages(group->pages, MAX_PAGE_ORDER);
-
kfree(group->system_name);
kfree(group);
}
@@ -247,19 +246,6 @@ static struct user_event_group
if (!group->system_name)
goto error;
- group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-
- if (!group->pages)
- goto error;
-
- group->register_page_data = page_address(group->pages);
-
- set_page_reservations(group->register_page_data, true);
-
- /* Zero all bits beside 0 (which is reserved for failures) */
- bitmap_zero(group->page_bitmap, MAX_EVENTS);
- set_bit(0, group->page_bitmap);
-
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -271,20 +257,514 @@ error:
return NULL;
};
-static __always_inline
-void user_event_register_set(struct user_event *user)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+{
+ list_del_rcu(&enabler->link);
+
+ /* No longer tracking the event via the enabler */
+ refcount_dec(&enabler->event->refcnt);
+
+ kfree(enabler);
+}
+
+static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr,
+ int attempt)
+{
+ bool unlocked;
+ int ret;
+
+ /*
+ * Normally this is low, ensure that it cannot be taken advantage of by
+ * bad user processes to cause excessive looping.
+ */
+ if (attempt > 10)
+ return -EFAULT;
+
+ mmap_read_lock(mm->mm);
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ &unlocked);
+out:
+ mmap_read_unlock(mm->mm);
+
+ return ret;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+ struct user_event_enabler_fault *fault = container_of(
+ work, struct user_event_enabler_fault, work);
+ struct user_event_enabler *enabler = fault->enabler;
+ struct user_event_mm *mm = fault->mm;
+ unsigned long uaddr = enabler->addr;
+ int attempt = fault->attempt;
+ int ret;
+
+ ret = user_event_mm_fault_in(mm, uaddr, attempt);
+
+ if (ret && ret != -ENOENT) {
+ struct user_event *user = enabler->event;
+
+ pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
+ }
+
+ /* Prevent state changes from racing */
+ mutex_lock(&event_mutex);
+
+ /* User asked for enabler to be removed during fault */
+ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
+ user_event_enabler_destroy(enabler);
+ goto out;
+ }
+
+ /*
+ * If we managed to get the page, re-issue the write. We do not
+ * want to get into a possible infinite loop, which is why we only
+ * attempt again directly if the page came in. If we couldn't get
+ * the page here, then we will try again the next time the event is
+ * enabled/disabled.
+ */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!ret) {
+ mmap_read_lock(mm->mm);
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ mmap_read_unlock(mm->mm);
+ }
+out:
+ mutex_unlock(&event_mutex);
+
+ /* In all cases we no longer need the mm or fault */
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ int attempt)
+{
+ struct user_event_enabler_fault *fault;
+
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+ if (!fault)
+ return false;
+
+ INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+ fault->mm = user_event_mm_get(mm);
+ fault->enabler = enabler;
+ fault->attempt = attempt;
+
+ /* Don't try to queue in again while we have a pending fault */
+ set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!schedule_work(&fault->work)) {
+ /* Allow another attempt later */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+
+ return false;
+ }
+
+ return true;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt)
+{
+ unsigned long uaddr = enabler->addr;
+ unsigned long *ptr;
+ struct page *page;
+ void *kaddr;
+ int ret;
+
+ lockdep_assert_held(&event_mutex);
+ mmap_assert_locked(mm->mm);
+
+ *attempt += 1;
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0)
+ return -ENOENT;
+
+ if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) ||
+ test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
+ return -EBUSY;
+
+ ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
+ &page, NULL, NULL);
+
+ if (unlikely(ret <= 0)) {
+ if (!fixup_fault)
+ return -EFAULT;
+
+ if (!user_event_enabler_queue_fault(mm, enabler, *attempt))
+ pr_warn("user_events: Unable to queue fault handler\n");
+
+ return -EFAULT;
+ }
+
+ kaddr = kmap_local_page(page);
+ ptr = kaddr + (uaddr & ~PAGE_MASK);
+
+ /* Update bit atomically, user tracers must be atomic as well */
+ if (enabler->event && enabler->event->status)
+ set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+ else
+ clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+
+ kunmap_local(kaddr);
+ unpin_user_pages_dirty_lock(&page, 1, true);
+
+ return 0;
+}
+
+static bool user_event_enabler_exists(struct user_event_mm *mm,
+ unsigned long uaddr, unsigned char bit)
{
- int i = user->index;
+ struct user_event_enabler *enabler;
+ struct user_event_enabler *next;
- user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link) {
+ if (enabler->addr == uaddr &&
+ (enabler->values & ENABLE_VAL_BIT_MASK) == bit)
+ return true;
+ }
+
+ return false;
}
-static __always_inline
-void user_event_register_clear(struct user_event *user)
+static void user_event_enabler_update(struct user_event *user)
{
- int i = user->index;
+ struct user_event_enabler *enabler;
+ struct user_event_mm *mm = user_event_mm_get_all(user);
+ struct user_event_mm *next;
+ int attempt;
+
+ while (mm) {
+ next = mm->next;
+ mmap_read_lock(mm->mm);
+ rcu_read_lock();
- user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+ list_for_each_entry_rcu(enabler, &mm->enablers, link) {
+ if (enabler->event == user) {
+ attempt = 0;
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ }
+ }
+
+ rcu_read_unlock();
+ mmap_read_unlock(mm->mm);
+ user_event_mm_put(mm);
+ mm = next;
+ }
+}
+
+static bool user_event_enabler_dup(struct user_event_enabler *orig,
+ struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler;
+
+ /* Skip pending frees */
+ if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
+ return true;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT);
+
+ if (!enabler)
+ return false;
+
+ enabler->event = orig->event;
+ enabler->addr = orig->addr;
+
+ /* Only dup part of value (ignore future flags, etc) */
+ enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
+
+ refcount_inc(&enabler->event->refcnt);
+ list_add_rcu(&enabler->link, &mm->enablers);
+
+ return true;
+}
+
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm)
+{
+ refcount_inc(&mm->refcnt);
+
+ return mm;
+}
+
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
+{
+ struct user_event_mm *found = NULL;
+ struct user_event_enabler *enabler;
+ struct user_event_mm *mm;
+
+ /*
+	 * We do not want to block fork/exec while enablements are being
+	 * updated, so we use RCU to walk the current tasks that have used
+	 * the user_events ABI for 1 or more events. Each enabler found in each
+	 * task that matches the event being updated gets a write to reflect
+	 * the kernel state back into the process. Waits/faults must not occur
+	 * during this, so we scan the list under RCU for all the mm that have
+	 * the event within them. This is needed because mmap_read_lock() can
+	 * wait. Each user mm returned has its ref count incremented to handle
+	 * removal races with RCU.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(mm, &user_event_mms, link)
+ list_for_each_entry_rcu(enabler, &mm->enablers, link)
+ if (enabler->event == user) {
+ mm->next = found;
+ found = user_event_mm_get(mm);
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return found;
+}
+
+static struct user_event_mm *user_event_mm_create(struct task_struct *t)
+{
+ struct user_event_mm *user_mm;
+ unsigned long flags;
+
+ user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
+
+ if (!user_mm)
+ return NULL;
+
+ user_mm->mm = t->mm;
+ INIT_LIST_HEAD(&user_mm->enablers);
+ refcount_set(&user_mm->refcnt, 1);
+ refcount_set(&user_mm->tasks, 1);
+
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_add_rcu(&user_mm->link, &user_event_mms);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ t->user_event_mm = user_mm;
+
+ /*
+ * The lifetime of the memory descriptor can slightly outlast
+ * the task lifetime if a ref to the user_event_mm is taken
+ * between list_del_rcu() and call_rcu(). Therefore we need
+ * to take a reference to it to ensure it can live this long
+ * under this corner case. This can also occur in clones that
+ * outlast the parent.
+ */
+ mmgrab(user_mm->mm);
+
+ return user_mm;
+}
+
+static struct user_event_mm *current_user_event_mm(void)
+{
+ struct user_event_mm *user_mm = current->user_event_mm;
+
+ if (user_mm)
+ goto inc;
+
+ user_mm = user_event_mm_create(current);
+
+ if (!user_mm)
+ goto error;
+inc:
+ refcount_inc(&user_mm->refcnt);
+error:
+ return user_mm;
+}
+
+static void user_event_mm_destroy(struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler, *next;
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+ user_event_enabler_destroy(enabler);
+
+ mmdrop(mm->mm);
+ kfree(mm);
+}
+
+static void user_event_mm_put(struct user_event_mm *mm)
+{
+ if (mm && refcount_dec_and_test(&mm->refcnt))
+ user_event_mm_destroy(mm);
+}
+
+static void delayed_user_event_mm_put(struct work_struct *work)
+{
+ struct user_event_mm *mm;
+
+ mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork);
+ user_event_mm_put(mm);
+}
+
+void user_event_mm_remove(struct task_struct *t)
+{
+ struct user_event_mm *mm;
+ unsigned long flags;
+
+ might_sleep();
+
+ mm = t->user_event_mm;
+ t->user_event_mm = NULL;
+
+ /* Clone will increment the tasks, only remove if last clone */
+ if (!refcount_dec_and_test(&mm->tasks))
+ return;
+
+ /* Remove the mm from the list, so it can no longer be enabled */
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_del_rcu(&mm->link);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ /*
+ * We need to wait for currently occurring writes to stop within
+ * the mm. This is required since exit_mm() snaps the current rss
+ * stats and clears them. On the final mmdrop(), check_mm() will
+ * report a bug if these increment.
+ *
+ * All writes/pins are done under mmap_read lock, take the write
+ * lock to ensure in-progress faults have completed. Faults that
+ * are pending but yet to run will check the task count and skip
+ * the fault since the mm is going away.
+ */
+ mmap_write_lock(mm->mm);
+ mmap_write_unlock(mm->mm);
+
+ /*
+ * Put for mm must be done after RCU delay to handle new refs in
+ * between the list_del_rcu() and now. This ensures any get refs
+ * during rcu_read_lock() are accounted for during list removal.
+ *
+ * CPU A | CPU B
+ * ---------------------------------------------------------------
+ * user_event_mm_remove() | rcu_read_lock();
+ * list_del_rcu() | list_for_each_entry_rcu();
+ * call_rcu() | refcount_inc();
+ * . | rcu_read_unlock();
+ * schedule_work() | .
+ * user_event_mm_put() | .
+ *
+	 * mmdrop() cannot be called in the softirq context of call_rcu(),
+	 * so we queue a work item via queue_rcu_work() to run it in process
+	 * context after the grace period.
+ */
+ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
+ queue_rcu_work(system_wq, &mm->put_rwork);
+}
+
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
+{
+ struct user_event_mm *mm = user_event_mm_create(t);
+ struct user_event_enabler *enabler;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+ if (!user_event_enabler_dup(enabler, mm))
+ goto error;
+
+ rcu_read_unlock();
+
+ return;
+error:
+ rcu_read_unlock();
+ user_event_mm_remove(t);
+}
+
+static bool current_user_event_enabler_exists(unsigned long uaddr,
+ unsigned char bit)
+{
+ struct user_event_mm *user_mm = current_user_event_mm();
+ bool exists;
+
+ if (!user_mm)
+ return false;
+
+ exists = user_event_enabler_exists(user_mm, uaddr, bit);
+
+ user_event_mm_put(user_mm);
+
+ return exists;
+}
+
+static struct user_event_enabler
+*user_event_enabler_create(struct user_reg *reg, struct user_event *user,
+ int *write_result)
+{
+ struct user_event_enabler *enabler;
+ struct user_event_mm *user_mm;
+ unsigned long uaddr = (unsigned long)reg->enable_addr;
+ int attempt = 0;
+
+ user_mm = current_user_event_mm();
+
+ if (!user_mm)
+ return NULL;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT);
+
+ if (!enabler)
+ goto out;
+
+ enabler->event = user;
+ enabler->addr = uaddr;
+ enabler->values = reg->enable_bit;
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Attempt to reflect the current state within the process */
+ mmap_read_lock(user_mm->mm);
+ *write_result = user_event_enabler_write(user_mm, enabler, false,
+ &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ /*
+ * If the write works, then we will track the enabler. A ref to the
+ * underlying user_event is held by the enabler to prevent it going
+ * away while the enabler is still in use by a process. The ref is
+	 * removed when the enabler is destroyed. This means an event cannot
+ * be forcefully deleted from the system until all tasks using it
+ * exit or run exec(), which includes forks and clones.
+ */
+ if (!*write_result) {
+ refcount_inc(&enabler->event->refcnt);
+ list_add_rcu(&enabler->link, &user_mm->enablers);
+ }
+
+ mutex_unlock(&event_mutex);
+
+ if (*write_result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+
+ kfree(enabler);
+ enabler = NULL;
+ }
+out:
+ user_event_mm_put(user_mm);
+
+ return enabler;
}
static __always_inline __must_check
@@ -449,7 +929,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
struct ftrace_event_field *field;
int validator_flags = 0;
- field = kmalloc(sizeof(*field), GFP_KERNEL);
+ field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT);
if (!field)
return -ENOMEM;
@@ -468,7 +948,7 @@ add_validator:
if (strstr(type, "char") != NULL)
validator_flags |= VALIDATOR_ENSURE_NULL;
- validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT);
if (!validator) {
kfree(field);
@@ -489,6 +969,9 @@ add_field:
field->is_signed = is_signed;
field->filter_type = filter_type;
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+
list_add(&field->link, &user->fields);
/*
@@ -754,7 +1237,7 @@ static int user_event_create_print_fmt(struct user_event *user)
len = user_event_set_print_fmt(user, NULL, 0);
- print_fmt = kmalloc(len, GFP_KERNEL);
+ print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT);
if (!print_fmt)
return -ENOMEM;
@@ -770,11 +1253,7 @@ static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
int flags,
struct trace_event *event)
{
- /* Unsafe to try to decode user provided print_fmt, use hex */
- trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
- 1, iter->ent, iter->ent_size, true);
-
- return trace_handle_return(&iter->seq);
+ return print_event_fields(iter, event);
}
static struct trace_event_functions user_event_funcs = {
@@ -820,6 +1299,8 @@ static int destroy_user_event(struct user_event *user)
{
int ret = 0;
+ lockdep_assert_held(&event_mutex);
+
/* Must destroy fields before call removal */
user_event_destroy_fields(user);
@@ -829,9 +1310,6 @@ static int destroy_user_event(struct user_event *user)
return ret;
dyn_event_remove(&user->devent);
-
- user_event_register_clear(user);
- clear_bit(user->index, user->group->page_bitmap);
hash_del(&user->node);
user_event_destroy_validators(user);
@@ -839,6 +1317,11 @@ static int destroy_user_event(struct user_event *user)
kfree(EVENT_NAME(user));
kfree(user);
+ if (current_user_events > 0)
+ current_user_events--;
+ else
+ pr_alert("BUG: Bad current_user_events\n");
+
return ret;
}
@@ -977,9 +1460,9 @@ discard:
#endif
/*
- * Update the register page that is shared between user processes.
+ * Update the enabled bit among all user processes.
*/
-static void update_reg_page_for(struct user_event *user)
+static void update_enable_bit_for(struct user_event *user)
{
struct tracepoint *tp = &user->tracepoint;
char status = 0;
@@ -1010,12 +1493,9 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}
- if (status)
- user_event_register_set(user);
- else
- user_event_register_clear(user);
-
user->status = status;
+
+ user_event_enabler_update(user);
}
/*
@@ -1072,10 +1552,10 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
refcount_inc(&user->refcnt);
- update_reg_page_for(user);
+ update_enable_bit_for(user);
return 0;
dec:
- update_reg_page_for(user);
+ update_enable_bit_for(user);
refcount_dec(&user->refcnt);
return 0;
}
@@ -1093,7 +1573,7 @@ static int user_event_create(const char *raw_command)
raw_command += USER_EVENTS_PREFIX_LEN;
raw_command = skip_spaces(raw_command);
- name = kstrdup(raw_command, GFP_KERNEL);
+ name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT);
if (!name)
return -ENOMEM;
@@ -1271,7 +1751,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
struct user_event **newuser)
{
int ret;
- int index;
u32 key;
struct user_event *user;
@@ -1290,12 +1769,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
return 0;
}
- index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
-
- if (index == MAX_EVENTS)
- return -EMFILE;
-
- user = kzalloc(sizeof(*user), GFP_KERNEL);
+ user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
if (!user)
return -ENOMEM;
@@ -1335,20 +1809,23 @@ static int user_event_parse(struct user_event_group *group, char *name,
mutex_lock(&event_mutex);
+ if (current_user_events >= max_user_events) {
+ ret = -EMFILE;
+ goto put_user_lock;
+ }
+
ret = user_event_trace_register(user);
if (ret)
goto put_user_lock;
- user->index = index;
-
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
- set_bit(user->index, group->page_bitmap);
hash_add(group->register_table, &user->node, key);
+ current_user_events++;
mutex_unlock(&event_mutex);
@@ -1398,6 +1875,9 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
return -EFAULT;
+ if (idx < 0)
+ return -EINVAL;
+
rcu_read_lock_sched();
refs = rcu_dereference_sched(info->refs);
@@ -1468,7 +1948,7 @@ static int user_events_open(struct inode *node, struct file *file)
if (!group)
return -ENOENT;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT);
if (!info)
return -ENOMEM;
@@ -1521,7 +2001,7 @@ static int user_events_ref_add(struct user_event_file_info *info,
size = struct_size(refs, events, count + 1);
- new_refs = kzalloc(size, GFP_KERNEL);
+ new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (!new_refs)
return -ENOMEM;
@@ -1564,6 +2044,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
+ /* Ensure no flags, since we don't support any yet */
+ if (kreg->flags != 0)
+ return -EINVAL;
+
+ /* Ensure supported size */
+ switch (kreg->enable_size) {
+ case 4:
+ /* 32-bit */
+ break;
+#if BITS_PER_LONG >= 64
+ case 8:
+ /* 64-bit */
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Ensure natural alignment */
+ if (kreg->enable_addr % kreg->enable_size)
+ return -EINVAL;
+
+ /* Ensure bit range for size */
+ if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1)
+ return -EINVAL;
+
+ /* Ensure accessible */
+ if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr,
+ kreg->enable_size))
+ return -EFAULT;
+
kreg->size = size;
return 0;
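
For reference, a registration that passes the checks above could look like the sketch below; the event name, enable word, and bit are illustrative assumptions that simply mirror the constraints (4- or 8-byte size, natural alignment, bit below enable_size * 8, accessible address):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    /* Hypothetical, naturally aligned 32-bit enable word */
    static uint32_t enabled;

    static int register_my_event(int data_fd)
    {
            struct user_reg reg = {0};

            reg.size = sizeof(reg);
            reg.enable_addr = (uint64_t)(uintptr_t)&enabled; /* naturally aligned */
            reg.enable_size = sizeof(enabled);               /* 4 (or 8 on 64-bit) */
            reg.enable_bit = 0;                              /* < enable_size * 8 */
            reg.name_args = (uint64_t)(uintptr_t)"my_event u32 count";

            return ioctl(data_fd, DIAG_IOCSREG, &reg);
    }
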
@@ -1578,14 +2089,26 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
struct user_reg reg;
struct user_event *user;
+ struct user_event_enabler *enabler;
char *name;
long ret;
+ int write_result;
ret = user_reg_get(ureg, &reg);
if (ret)
return ret;
+ /*
+ * Prevent users from using the same address and bit multiple times
+ * within the same mm address space. This can cause unexpected behavior
+	 * for user processes and is far easier to debug if it is explicitly
+ * an error upon registering.
+ */
+ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
+ reg.enable_bit))
+ return -EADDRINUSE;
+
name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
MAX_EVENT_DESC);
@@ -1610,8 +2133,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
if (ret < 0)
return ret;
+ /*
+ * user_events_ref_add succeeded:
+	 * At this point we have a user_event; its lifetime is bound by the
+ * reference count, not this file. If anything fails, the user_event
+ * still has a reference until the file is released. During release
+ * any remaining references (from user_events_ref_add) are decremented.
+ *
+	 * Attempt to create an enabler, whose lifetime is tied to the event in
+	 * the same way. Once the task that caused the enabler to be
+	 * created exits or issues exec(), the enablers it has created
+ * will be destroyed and the ref to the event will be decremented.
+ */
+ enabler = user_event_enabler_create(&reg, user, &write_result);
+
+ if (!enabler)
+ return -ENOMEM;
+
+ /* Write failed/faulted, give error back to caller */
+ if (write_result)
+ return write_result;
+
put_user((u32)ret, &ureg->write_index);
- put_user(user->index, &ureg->status_bit);
return 0;
}
@@ -1641,6 +2184,114 @@ static long user_events_ioctl_del(struct user_event_file_info *info,
return ret;
}
+static long user_unreg_get(struct user_unreg __user *ureg,
+ struct user_unreg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (size < offsetofend(struct user_unreg, disable_addr))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ /* Ensure no reserved values, since we don't support any yet */
+ if (kreg->__reserved || kreg->__reserved2)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
+ unsigned long uaddr, unsigned char bit)
+{
+ struct user_event_enabler enabler;
+ int result;
+ int attempt = 0;
+
+ memset(&enabler, 0, sizeof(enabler));
+ enabler.addr = uaddr;
+ enabler.values = bit;
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Force the bit to be cleared, since no event is attached */
+ mmap_read_lock(user_mm->mm);
+ result = user_event_enabler_write(user_mm, &enabler, false, &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ mutex_unlock(&event_mutex);
+
+ if (result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+ }
+
+ return result;
+}
+
+/*
+ * Unregisters an enablement address/bit within a task/user mm.
+ */
+static long user_events_ioctl_unreg(unsigned long uarg)
+{
+ struct user_unreg __user *ureg = (struct user_unreg __user *)uarg;
+ struct user_event_mm *mm = current->user_event_mm;
+ struct user_event_enabler *enabler, *next;
+ struct user_unreg reg;
+ long ret;
+
+ ret = user_unreg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ if (!mm)
+ return -ENOENT;
+
+ ret = -ENOENT;
+
+ /*
+	 * The freeing and faulting flags indicate whether the enabler is still
+	 * in use. When faulting is set, a page fault is being handled
+	 * asynchronously; if freeing is set as well, the enabler is destroyed
+	 * when that fault completes. If no asynchronous fault is in flight, we
+	 * can destroy the enabler now since we hold the event_mutex during
+	 * these checks.
+ */
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+ if (enabler->addr == reg.disable_addr &&
+ (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) {
+ set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
+ user_event_enabler_destroy(enabler);
+
+ /* Removed at least one */
+ ret = 0;
+ }
+
+ mutex_unlock(&event_mutex);
+
+ /* Ensure bit is now cleared for user, regardless of event status */
+ if (!ret)
+ ret = user_event_mm_clear_bit(mm, reg.disable_addr,
+ reg.disable_bit);
+
+ return ret;
+}
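
A user-space caller tears the enabler down again with the new DIAG_IOCSUNREG ioctl; a minimal sketch, assuming the same hypothetical enable word and bit that were registered earlier:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    static int unregister_my_event(int data_fd, uint32_t *enabled, int bit)
    {
            struct user_unreg unreg = {0};

            unreg.size = sizeof(unreg);
            unreg.disable_bit = bit;
            unreg.disable_addr = (uint64_t)(uintptr_t)enabled;

            /* On success the kernel also clears the bit in *enabled */
            return ioctl(data_fd, DIAG_IOCSUNREG, &unreg);
    }
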
+
/*
* Handles the ioctl from user mode to register or alter operations.
*/
@@ -1663,6 +2314,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
ret = user_events_ioctl_del(info, uarg);
mutex_unlock(&group->reg_mutex);
break;
+
+ case DIAG_IOCSUNREG:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_unreg(uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
}
return ret;
@@ -1718,45 +2375,13 @@ out:
}
static const struct file_operations user_data_fops = {
- .open = user_events_open,
- .write = user_events_write,
- .write_iter = user_events_write_iter,
+ .open = user_events_open,
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
.unlocked_ioctl = user_events_ioctl,
- .release = user_events_release,
+ .release = user_events_release,
};
-static struct user_event_group *user_status_group(struct file *file)
-{
- struct seq_file *m = file->private_data;
-
- if (!m)
- return NULL;
-
- return m->private;
-}
-
-/*
- * Maps the shared page into the user process for checking if event is enabled.
- */
-static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
-{
- char *pages;
- struct user_event_group *group = user_status_group(file);
- unsigned long size = vma->vm_end - vma->vm_start;
-
- if (size != MAX_BYTES)
- return -EINVAL;
-
- if (!group)
- return -EINVAL;
-
- pages = group->register_page_data;
-
- return remap_pfn_range(vma, vma->vm_start,
- virt_to_phys(pages) >> PAGE_SHIFT,
- size, vm_get_page_prot(VM_READ));
-}
-
static void *user_seq_start(struct seq_file *m, loff_t *pos)
{
if (*pos)
@@ -1780,7 +2405,7 @@ static int user_seq_show(struct seq_file *m, void *p)
struct user_event_group *group = m->private;
struct user_event *user;
char status;
- int i, active = 0, busy = 0, flags;
+ int i, active = 0, busy = 0;
if (!group)
return -EINVAL;
@@ -1789,11 +2414,10 @@ static int user_seq_show(struct seq_file *m, void *p)
hash_for_each(group->register_table, i, user, node) {
status = user->status;
- flags = user->flags;
- seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_NAME(user));
- if (flags != 0 || status != 0)
+ if (status != 0)
seq_puts(m, " #");
if (status != 0) {
@@ -1816,16 +2440,15 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_puts(m, "\n");
seq_printf(m, "Active: %d\n", active);
seq_printf(m, "Busy: %d\n", busy);
- seq_printf(m, "Max: %ld\n", MAX_EVENTS);
return 0;
}
static const struct seq_operations user_seq_ops = {
- .start = user_seq_start,
- .next = user_seq_next,
- .stop = user_seq_stop,
- .show = user_seq_show,
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
};
static int user_status_open(struct inode *node, struct file *file)
@@ -1851,11 +2474,10 @@ static int user_status_open(struct inode *node, struct file *file)
}
static const struct file_operations user_status_fops = {
- .open = user_status_open,
- .mmap = user_status_mmap,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+ .open = user_status_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
/*
@@ -1873,8 +2495,7 @@ static int create_user_tracefs(void)
goto err;
}
- /* mmap with MAP_SHARED requires writable fd */
- emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ,
NULL, NULL, &user_status_fops);
if (!emmap) {
@@ -1888,20 +2509,53 @@ err:
return -ENODEV;
}
+static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = proc_douintvec(table, write, buffer, lenp, ppos);
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static struct ctl_table user_event_sysctls[] = {
+ {
+ .procname = "user_events_max",
+ .data = &max_user_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = set_max_user_events_sysctl,
+ },
+ {}
+};
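
With this table registered under "kernel", the limit appears as /proc/sys/kernel/user_events_max and can be adjusted at runtime; writes are serialized against event creation by taking event_mutex in the handler above.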
+
static int __init trace_events_user_init(void)
{
int ret;
+ fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+ if (!fault_cache)
+ return -ENOMEM;
+
init_group = user_event_group_create(&init_user_ns);
- if (!init_group)
+ if (!init_group) {
+ kmem_cache_destroy(fault_cache);
return -ENOMEM;
+ }
ret = create_user_tracefs();
if (ret) {
pr_warn("user_events could not register with tracefs\n");
user_event_group_destroy(init_group);
+ kmem_cache_destroy(fault_cache);
init_group = NULL;
return ret;
}
@@ -1909,6 +2563,8 @@ static int __init trace_events_user_init(void)
if (dyn_event_register(&user_event_dops))
pr_warn("user_events could not register with dyn_events\n");
+ register_sysctl_init("kernel", user_event_sysctls);
+
return 0;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bd475a00f96d..15f05faaae44 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -221,8 +221,11 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
const char *ret = trace_seq_buffer_ptr(p);
const char *fmt = concatenate ? "%*phN" : "%*ph";
- for (i = 0; i < buf_len; i += 16)
+ for (i = 0; i < buf_len; i += 16) {
+ if (!concatenate && i != 0)
+ trace_seq_putc(p, ' ');
trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);
+ }
trace_seq_putc(p, 0);
return ret;
@@ -808,6 +811,176 @@ EXPORT_SYMBOL_GPL(unregister_trace_event);
* Standard events
*/
+static void print_array(struct trace_iterator *iter, void *pos,
+ struct ftrace_event_field *field)
+{
+ int offset;
+ int len;
+ int i;
+
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ return;
+ }
+
+ pos = (void *)iter->ent + offset;
+
+ for (i = 0; i < len; i++, pos++) {
+ if (i)
+ trace_seq_putc(&iter->seq, ',');
+ trace_seq_printf(&iter->seq, "%02x", *(unsigned char *)pos);
+ }
+}
+
+static void print_fields(struct trace_iterator *iter, struct trace_event_call *call,
+ struct list_head *head)
+{
+ struct ftrace_event_field *field;
+ int offset;
+ int len;
+ int ret;
+ void *pos;
+
+ list_for_each_entry(field, head, link) {
+ trace_seq_printf(&iter->seq, " %s=", field->name);
+ if (field->offset + field->size > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ continue;
+ }
+ pos = (void *)iter->ent + field->offset;
+
+ switch (field->filter_type) {
+ case FILTER_COMM:
+ case FILTER_STATIC_STRING:
+ trace_seq_printf(&iter->seq, "%.*s", field->size, (char *)pos);
+ break;
+ case FILTER_RDYN_STRING:
+ case FILTER_DYN_STRING:
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field->filter_type == FILTER_RDYN_STRING)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ break;
+ }
+ pos = (void *)iter->ent + offset;
+ trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos);
+ break;
+ case FILTER_PTR_STRING:
+ if (!iter->fmt_size)
+ trace_iter_expand_format(iter);
+ pos = *(void **)pos;
+ ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ iter->fmt_size);
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", pos);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ pos, iter->fmt);
+ break;
+ case FILTER_TRACE_FN:
+ pos = *(void **)pos;
+ trace_seq_printf(&iter->seq, "%pS", pos);
+ break;
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ switch (field->size) {
+ case 1:
+ if (isprint(*(char *)pos)) {
+ trace_seq_printf(&iter->seq, "'%c'",
+ *(unsigned char *)pos);
+ }
+ trace_seq_printf(&iter->seq, "(%d)",
+ *(unsigned char *)pos);
+ break;
+ case 2:
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned short *)pos,
+ *(unsigned short *)pos);
+ break;
+ case 4:
+ /* dynamic array info is 4 bytes */
+ if (strstr(field->type, "__data_loc")) {
+ print_array(iter, pos, NULL);
+ break;
+ }
+
+ if (strstr(field->type, "__rel_loc")) {
+ print_array(iter, pos, field);
+ break;
+ }
+
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned int *)pos,
+ *(unsigned int *)pos);
+ break;
+ case 8:
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)",
+ *(unsigned long long *)pos,
+ *(unsigned long long *)pos);
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
+ break;
+ }
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-TYPE>");
+ }
+ }
+ trace_seq_putc(&iter->seq, '\n');
+}
+
+enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event)
+{
+ struct trace_event_call *call;
+ struct list_head *head;
+
+ /* ftrace defined events have separate call structures */
+ if (event->type <= __TRACE_LAST_TYPE) {
+ bool found = false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->event.type == event->type) {
+ found = true;
+ break;
+ }
+ /* No need to search all events */
+ if (call->event.type > __TRACE_LAST_TYPE)
+ break;
+ }
+ up_read(&trace_event_sem);
+ if (!found) {
+ trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
+ goto out;
+ }
+ } else {
+ call = container_of(event, struct trace_event_call, event);
+ }
+ head = trace_get_fields(call);
+
+ trace_seq_printf(&iter->seq, "%s:", trace_event_name(call));
+
+ if (head && !list_empty(head))
+ print_fields(iter, call, head);
+ else
+ trace_seq_puts(&iter->seq, "No fields found\n");
+
+ out:
+ return trace_handle_return(&iter->seq);
+}
+
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 4c954636caf0..dca40f1f1da4 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -19,6 +19,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
+extern enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event);
extern void trace_event_read_lock(void);
extern void trace_event_read_unlock(void);
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 0a68f7aa85d6..45c450f423fa 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -93,6 +93,38 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
}
EXPORT_SYMBOL_GPL(seq_buf_printf);
+/**
+ * seq_buf_do_printk - printk seq_buf line by line
+ * @s: seq_buf descriptor
+ * @lvl: printk level
+ *
+ * printk()-s a multi-line sequential buffer line by line. The function
+ * makes sure that the buffer in @s is nul terminated and safe to read
+ * as a string.
+ */
+void seq_buf_do_printk(struct seq_buf *s, const char *lvl)
+{
+ const char *start, *lf;
+
+ if (s->size == 0 || s->len == 0)
+ return;
+
+ seq_buf_terminate(s);
+
+ start = s->buffer;
+ while ((lf = strchr(start, '\n'))) {
+ int len = lf - start + 1;
+
+ printk("%s%.*s", lvl, len, start);
+ start = ++lf;
+ }
+
+ /* No trailing LF */
+ if (start < s->buffer + s->len)
+ printk("%s%s\n", lvl, start);
+}
+EXPORT_SYMBOL_GPL(seq_buf_do_printk);
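
A minimal usage sketch for the new helper (illustrative only), assuming a caller that batches a few lines into a stack buffer and then flushes them at KERN_INFO:

    char buf[128];
    struct seq_buf s;

    seq_buf_init(&s, buf, sizeof(buf));
    seq_buf_printf(&s, "first line: %d\n", 1);
    seq_buf_printf(&s, "second line: %d\n", 2);

    /* Emits one printk() per line, each prefixed with the given level */
    seq_buf_do_printk(&s, KERN_INFO);
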
+
#ifdef CONFIG_BINARY_PRINTF
/**
* seq_buf_bprintf - Write the printf string from binary arguments
diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
index fd6153800e56..079435a2e26c 100644
--- a/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@ -17,8 +17,11 @@ static u32 rand1, entry_val, exit_val;
/* Use indirect calls to avoid inlining the target functions */
static u32 (*target)(u32 value);
static u32 (*target2)(u32 value);
+static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
static unsigned long target_ip;
static unsigned long target2_ip;
+static unsigned long target_nest_ip;
+static int entry_return_value;
static noinline u32 fprobe_selftest_target(u32 value)
{
@@ -30,16 +33,31 @@ static noinline u32 fprobe_selftest_target2(u32 value)
return (value / div_factor) + 1;
}
-static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
+{
+ return nest(value + 2);
+}
+
+static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
{
KUNIT_EXPECT_FALSE(current_test, preemptible());
/* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */
if (ip != target_ip)
KUNIT_EXPECT_EQ(current_test, ip, target2_ip);
entry_val = (rand1 / div_factor);
+ if (fp->entry_data_size) {
+ KUNIT_EXPECT_NOT_NULL(current_test, data);
+ if (data)
+ *(u32 *)data = entry_val;
+ } else
+ KUNIT_EXPECT_NULL(current_test, data);
+
+ return entry_return_value;
}
-static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
{
unsigned long ret = regs_return_value(regs);
@@ -51,6 +69,26 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct
KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor));
KUNIT_EXPECT_EQ(current_test, entry_val, (rand1 / div_factor));
exit_val = entry_val + div_factor;
+ if (fp->entry_data_size) {
+ KUNIT_EXPECT_NOT_NULL(current_test, data);
+ if (data)
+ KUNIT_EXPECT_EQ(current_test, *(u32 *)data, entry_val);
+ } else
+ KUNIT_EXPECT_NULL(current_test, data);
+}
+
+static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
+{
+ KUNIT_EXPECT_FALSE(current_test, preemptible());
+ return 0;
+}
+
+static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
+{
+ KUNIT_EXPECT_FALSE(current_test, preemptible());
+ KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
}
/* Test entry only (no rethook) */
@@ -132,6 +170,64 @@ static void test_fprobe_syms(struct kunit *test)
KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
}
+/* Test private entry_data */
+static void test_fprobe_data(struct kunit *test)
+{
+ struct fprobe fp = {
+ .entry_handler = fp_entry_handler,
+ .exit_handler = fp_exit_handler,
+ .entry_data_size = sizeof(u32),
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+
+ target(rand1);
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+}
+
+/* Test nr_maxactive */
+static void test_fprobe_nest(struct kunit *test)
+{
+ static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
+ struct fprobe fp = {
+ .entry_handler = nest_entry_handler,
+ .exit_handler = nest_exit_handler,
+ .nr_maxactive = 1,
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
+
+ target_nest(rand1, target);
+ KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+}
+
+static void test_fprobe_skip(struct kunit *test)
+{
+ struct fprobe fp = {
+ .entry_handler = fp_entry_handler,
+ .exit_handler = fp_exit_handler,
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+
+ entry_return_value = 1;
+ entry_val = 0;
+ exit_val = 0;
+ target(rand1);
+ KUNIT_EXPECT_NE(test, 0, entry_val);
+ KUNIT_EXPECT_EQ(test, 0, exit_val);
+ KUNIT_EXPECT_EQ(test, 0, fp.nmissed);
+ entry_return_value = 0;
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+}
+
static unsigned long get_ftrace_location(void *func)
{
unsigned long size, addr = (unsigned long)func;
@@ -147,8 +243,10 @@ static int fprobe_test_init(struct kunit *test)
rand1 = get_random_u32_above(div_factor);
target = fprobe_selftest_target;
target2 = fprobe_selftest_target2;
+ target_nest = fprobe_selftest_nest_target;
target_ip = get_ftrace_location(target);
target2_ip = get_ftrace_location(target2);
+ target_nest_ip = get_ftrace_location(target_nest);
return 0;
}
@@ -157,6 +255,9 @@ static struct kunit_case fprobe_testcases[] = {
KUNIT_CASE(test_fprobe_entry),
KUNIT_CASE(test_fprobe),
KUNIT_CASE(test_fprobe_syms),
+ KUNIT_CASE(test_fprobe_data),
+ KUNIT_CASE(test_fprobe_nest),
+ KUNIT_CASE(test_fprobe_skip),
{}
};
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index e22da8573116..4efc8feb6277 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -48,7 +48,8 @@ static void show_backtrace(void)
stack_trace_print(stacks, len, 24);
}
-static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static int sample_entry_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
{
if (use_trace)
/*
@@ -61,9 +62,11 @@ static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_
nhit++;
if (stackdump)
show_backtrace();
+ return 0;
}
-static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs,
+ void *data)
{
unsigned long rip = instruction_pointer(regs);
diff --git a/samples/user_events/example.c b/samples/user_events/example.c
index 18e34c9d708e..28165a096697 100644
--- a/samples/user_events/example.c
+++ b/samples/user_events/example.c
@@ -9,51 +9,28 @@
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <sys/uio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
-#include <asm/bitsperlong.h>
-#include <endian.h>
#include <linux/user_events.h>
-#if __BITS_PER_LONG == 64
-#define endian_swap(x) htole64(x)
-#else
-#define endian_swap(x) htole32(x)
-#endif
-
-/* Assumes debugfs is mounted */
const char *data_file = "/sys/kernel/tracing/user_events_data";
-const char *status_file = "/sys/kernel/tracing/user_events_status";
+int enabled = 0;
-static int event_status(long **status)
-{
- int fd = open(status_file, O_RDONLY);
-
- *status = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
- MAP_SHARED, fd, 0);
-
- close(fd);
-
- if (*status == MAP_FAILED)
- return -1;
-
- return 0;
-}
-
-static int event_reg(int fd, const char *command, long *index, long *mask,
- int *write)
+static int event_reg(int fd, const char *command, int *write, int *enabled)
{
struct user_reg reg = {0};
reg.size = sizeof(reg);
+ reg.enable_bit = 31;
+ reg.enable_size = sizeof(*enabled);
+ reg.enable_addr = (__u64)enabled;
reg.name_args = (__u64)command;
if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
return -1;
- *index = reg.status_bit / __BITS_PER_LONG;
- *mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG));
*write = reg.write_index;
return 0;
@@ -62,17 +39,12 @@ static int event_reg(int fd, const char *command, long *index, long *mask,
int main(int argc, char **argv)
{
int data_fd, write;
- long index, mask;
- long *status_page;
struct iovec io[2];
__u32 count = 0;
- if (event_status(&status_page) == -1)
- return errno;
-
data_fd = open(data_file, O_RDWR);
- if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1)
+ if (event_reg(data_fd, "test u32 count", &write, &enabled) == -1)
return errno;
/* Setup iovec */
@@ -80,13 +52,12 @@ int main(int argc, char **argv)
io[0].iov_len = sizeof(write);
io[1].iov_base = &count;
io[1].iov_len = sizeof(count);
-
ask:
printf("Press enter to check status...\n");
getchar();
/* Check if anyone is listening */
- if (status_page[index] & mask) {
+ if (enabled) {
/* Yep, trace out our data */
writev(data_fd, (const struct iovec *)io, 2);
diff --git a/scripts/leaking_addresses.pl b/scripts/leaking_addresses.pl
index 8f636a23bc3f..e695634d153d 100755
--- a/scripts/leaking_addresses.pl
+++ b/scripts/leaking_addresses.pl
@@ -61,6 +61,7 @@ my @skip_abs = (
'/proc/device-tree',
'/proc/1/syscall',
'/sys/firmware/devicetree',
+ '/sys/kernel/tracing/trace_pipe',
'/sys/kernel/debug/tracing/trace_pipe',
'/sys/kernel/security/apparmor/revision');
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index e30216525325..40ae6b2c7a6d 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -110,6 +110,7 @@ static ssize_t uwrite(void const *const buf, size_t const count)
{
size_t cnt = count;
off_t idx = 0;
+ void *p = NULL;
file_updated = 1;
@@ -117,7 +118,10 @@ static ssize_t uwrite(void const *const buf, size_t const count)
off_t aoffset = (file_ptr + count) - file_end;
if (aoffset > file_append_size) {
- file_append = realloc(file_append, aoffset);
+ p = realloc(file_append, aoffset);
+ if (!p)
+ free(file_append);
+ file_append = p;
file_append_size = aoffset;
}
if (!file_append) {
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 6f28180ffeea..15bf00e79e3f 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -627,7 +627,7 @@ class TracepointProvider(Provider):
name)'.
All available events have directories under
- /sys/kernel/debug/tracing/events/ which export information
+ /sys/kernel/tracing/events/ which export information
about the specific event. Therefore, listing the dirs gives us
a list of all available events.
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
index 95f403a0c46d..0381c34fdd56 100644
--- a/tools/testing/selftests/mm/protection_keys.c
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -98,7 +98,7 @@ int tracing_root_ok(void)
void tracing_on(void)
{
#if CONTROL_TRACING > 0
-#define TRACEDIR "/sys/kernel/debug/tracing"
+#define TRACEDIR "/sys/kernel/tracing"
char pidstr[32];
if (!tracing_root_ok())
@@ -124,7 +124,7 @@ void tracing_off(void)
#if CONTROL_TRACING > 0
if (!tracing_root_ok())
return;
- cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+ cat_into_file("0", "/sys/kernel/tracing/tracing_on");
#endif
}
diff --git a/tools/testing/selftests/user_events/Makefile b/tools/testing/selftests/user_events/Makefile
index 6b512b86aec3..9e95bd41b0b4 100644
--- a/tools/testing/selftests/user_events/Makefile
+++ b/tools/testing/selftests/user_events/Makefile
@@ -10,7 +10,7 @@ LDLIBS += -lrt -lpthread -lm
# This test will not compile until user_events.h is added
# back to uapi.
-TEST_GEN_PROGS = ftrace_test dyn_test perf_test
+TEST_GEN_PROGS = ftrace_test dyn_test perf_test abi_test
TEST_FILES := settings
diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c
new file mode 100644
index 000000000000..5125c42efe65
--- /dev/null
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User Events ABI Test Program
+ *
+ * Copyright (c) 2022 Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <errno.h>
+#include <linux/user_events.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+
+#include "../kselftest_harness.h"
+
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable";
+
+static int change_event(bool enable)
+{
+ int fd = open(enable_file, O_RDWR);
+ int ret;
+
+ if (fd < 0)
+ return -1;
+
+ if (enable)
+ ret = write(fd, "1", 1);
+ else
+ ret = write(fd, "0", 1);
+
+ close(fd);
+
+ if (ret == 1)
+ ret = 0;
+ else
+ ret = -1;
+
+ return ret;
+}
+
+static int reg_enable(long *enable, int size, int bit)
+{
+ struct user_reg reg = {0};
+ int fd = open(data_file, O_RDWR);
+ int ret;
+
+ if (fd < 0)
+ return -1;
+
+ reg.size = sizeof(reg);
+ reg.name_args = (__u64)"__abi_event";
+ reg.enable_bit = bit;
+ reg.enable_addr = (__u64)enable;
+ reg.enable_size = size;
+
+ ret = ioctl(fd, DIAG_IOCSREG, &reg);
+
+ close(fd);
+
+ return ret;
+}
+
+static int reg_disable(long *enable, int bit)
+{
+ struct user_unreg reg = {0};
+ int fd = open(data_file, O_RDWR);
+ int ret;
+
+ if (fd < 0)
+ return -1;
+
+ reg.size = sizeof(reg);
+ reg.disable_bit = bit;
+ reg.disable_addr = (__u64)enable;
+
+ ret = ioctl(fd, DIAG_IOCSUNREG, &reg);
+
+ close(fd);
+
+ return ret;
+}
+
+FIXTURE(user) {
+ long check;
+};
+
+FIXTURE_SETUP(user) {
+ change_event(false);
+ self->check = 0;
+}
+
+FIXTURE_TEARDOWN(user) {
+}
+
+TEST_F(user, enablement) {
+ /* Changes should be reflected immediately */
+ ASSERT_EQ(0, self->check);
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0));
+ ASSERT_EQ(0, change_event(true));
+ ASSERT_EQ(1, self->check);
+ ASSERT_EQ(0, change_event(false));
+ ASSERT_EQ(0, self->check);
+
+ /* Ensure kernel clears bit after disable */
+ ASSERT_EQ(0, change_event(true));
+ ASSERT_EQ(1, self->check);
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ ASSERT_EQ(0, self->check);
+
+ /* Ensure doesn't change after unreg */
+ ASSERT_EQ(0, change_event(true));
+ ASSERT_EQ(0, self->check);
+ ASSERT_EQ(0, change_event(false));
+}
+
+TEST_F(user, bit_sizes) {
+ /* Allow 0-31 bits for 32-bit */
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0));
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 31));
+ ASSERT_NE(0, reg_enable(&self->check, sizeof(int), 32));
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ ASSERT_EQ(0, reg_disable(&self->check, 31));
+
+#if BITS_PER_LONG == 64
+	/* Allow 0-63 bits for 64-bit */
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(long), 63));
+ ASSERT_NE(0, reg_enable(&self->check, sizeof(long), 64));
+ ASSERT_EQ(0, reg_disable(&self->check, 63));
+#endif
+
+	/* Disallowed sizes (everything besides 4 and 8) */
+ ASSERT_NE(0, reg_enable(&self->check, 1, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 2, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 3, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 5, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 6, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 7, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 9, 0));
+ ASSERT_NE(0, reg_enable(&self->check, 128, 0));
+}
+
+TEST_F(user, forks) {
+ int i;
+
+ /* Ensure COW pages get updated after fork */
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0));
+ ASSERT_EQ(0, self->check);
+
+ if (fork() == 0) {
+ /* Force COW */
+ self->check = 0;
+
+ /* Up to 1 sec for enablement */
+ for (i = 0; i < 10; ++i) {
+ usleep(100000);
+
+ if (self->check)
+ exit(0);
+ }
+
+ exit(1);
+ }
+
+ /* Allow generous time for COW, then enable */
+ usleep(100000);
+ ASSERT_EQ(0, change_event(true));
+
+ ASSERT_NE(-1, wait(&i));
+ ASSERT_EQ(0, WEXITSTATUS(i));
+
+ /* Ensure child doesn't disable parent */
+ if (fork() == 0)
+ exit(reg_disable(&self->check, 0));
+
+ ASSERT_NE(-1, wait(&i));
+ ASSERT_EQ(0, WEXITSTATUS(i));
+ ASSERT_EQ(1, self->check);
+ ASSERT_EQ(0, change_event(false));
+ ASSERT_EQ(0, self->check);
+}
+
+/* Waits up to 1 sec for enablement */
+static int clone_check(void *check)
+{
+ int i;
+
+ for (i = 0; i < 10; ++i) {
+ usleep(100000);
+
+ if (*(long *)check)
+ return 0;
+ }
+
+ return 1;
+}
+
+TEST_F(user, clones) {
+ int i, stack_size = 4096;
+ void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK,
+ -1, 0);
+
+ ASSERT_NE(MAP_FAILED, stack);
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0));
+ ASSERT_EQ(0, self->check);
+
+ /* Shared VM should see enablements */
+ ASSERT_NE(-1, clone(&clone_check, stack + stack_size,
+ CLONE_VM | SIGCHLD, &self->check));
+
+ ASSERT_EQ(0, change_event(true));
+ ASSERT_NE(-1, wait(&i));
+ ASSERT_EQ(0, WEXITSTATUS(i));
+ munmap(stack, stack_size);
+ ASSERT_EQ(0, change_event(false));
+}
+
+int main(int argc, char **argv)
+{
+ return test_harness_run(argc, argv);
+}
diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c
index d6265d14cd51..8879a7b04c6a 100644
--- a/tools/testing/selftests/user_events/dyn_test.c
+++ b/tools/testing/selftests/user_events/dyn_test.c
@@ -16,7 +16,7 @@
#include "../kselftest_harness.h"
-const char *dyn_file = "/sys/kernel/debug/tracing/dynamic_events";
+const char *dyn_file = "/sys/kernel/tracing/dynamic_events";
const char *clear = "!u:__test_event";
static int Append(const char *value)
diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c
index 404a2713dcae..7c99cef94a65 100644
--- a/tools/testing/selftests/user_events/ftrace_test.c
+++ b/tools/testing/selftests/user_events/ftrace_test.c
@@ -12,20 +12,16 @@
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
+#include <sys/uio.h>
#include <unistd.h>
#include "../kselftest_harness.h"
-const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
-const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
-const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/enable";
-const char *trace_file = "/sys/kernel/debug/tracing/trace";
-const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";
-
-static inline int status_check(char *status_page, int status_bit)
-{
- return status_page[status_bit >> 3] & (1 << (status_bit & 7));
-}
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+const char *status_file = "/sys/kernel/tracing/user_events_status";
+const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable";
+const char *trace_file = "/sys/kernel/tracing/trace";
+const char *fmt_file = "/sys/kernel/tracing/events/user_events/__test_event/format";
static int trace_bytes(void)
{
@@ -106,13 +102,23 @@ err:
return -1;
}
-static int clear(void)
+static int clear(int *check)
{
+ struct user_unreg unreg = {0};
+
+ unreg.size = sizeof(unreg);
+ unreg.disable_bit = 31;
+ unreg.disable_addr = (__u64)check;
+
int fd = open(data_file, O_RDWR);
if (fd == -1)
return -1;
+ if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1)
+ if (errno != ENOENT)
+ return -1;
+
if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1)
if (errno != ENOENT)
return -1;
@@ -122,7 +128,7 @@ static int clear(void)
return 0;
}
-static int check_print_fmt(const char *event, const char *expected)
+static int check_print_fmt(const char *event, const char *expected, int *check)
{
struct user_reg reg = {0};
char print_fmt[256];
@@ -130,7 +136,7 @@ static int check_print_fmt(const char *event, const char *expected)
int fd;
/* Ensure cleared */
- ret = clear();
+ ret = clear(check);
if (ret != 0)
return ret;
@@ -142,14 +148,19 @@ static int check_print_fmt(const char *event, const char *expected)
reg.size = sizeof(reg);
reg.name_args = (__u64)event;
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)check;
+ reg.enable_size = sizeof(*check);
/* Register should work */
ret = ioctl(fd, DIAG_IOCSREG, &reg);
close(fd);
- if (ret != 0)
+ if (ret != 0) {
+ printf("Reg failed in fmt\n");
return ret;
+ }
/* Ensure correct print_fmt */
ret = get_print_fmt(print_fmt, sizeof(print_fmt));
@@ -164,6 +175,7 @@ FIXTURE(user) {
int status_fd;
int data_fd;
int enable_fd;
+ int check;
};
FIXTURE_SETUP(user) {
@@ -185,59 +197,63 @@ FIXTURE_TEARDOWN(user) {
close(self->enable_fd);
}
- ASSERT_EQ(0, clear());
+ if (clear(&self->check) != 0)
+ printf("WARNING: Clear didn't work!\n");
}
TEST_F(user, register_events) {
struct user_reg reg = {0};
- int page_size = sysconf(_SC_PAGESIZE);
- char *status_page;
+ struct user_unreg unreg = {0};
reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)&self->check;
+ reg.enable_size = sizeof(self->check);
- status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
- self->status_fd, 0);
+ unreg.size = sizeof(unreg);
+ unreg.disable_bit = 31;
+ unreg.disable_addr = (__u64)&self->check;
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
- /* Multiple registers should result in same index */
+ /* Multiple registers to the same addr + bit should fail */
+ ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
+ ASSERT_EQ(EADDRINUSE, errno);
+
+ /* Multiple registers to same name should result in same index */
+ reg.enable_bit = 30;
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
/* Ensure disabled */
self->enable_fd = open(enable_file, O_RDWR);
ASSERT_NE(-1, self->enable_fd);
ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
- /* MMAP should work and be zero'd */
- ASSERT_NE(MAP_FAILED, status_page);
- ASSERT_NE(NULL, status_page);
- ASSERT_EQ(0, status_check(status_page, reg.status_bit));
-
/* Enable event and ensure bits updated in status */
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
- ASSERT_NE(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(1 << reg.enable_bit, self->check);
/* Disable event and ensure bits updated in status */
ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
- ASSERT_EQ(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(0, self->check);
/* File still open should return -EBUSY for delete */
ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
ASSERT_EQ(EBUSY, errno);
- /* Delete should work only after close */
+ /* Unregister */
+ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg));
+ unreg.disable_bit = 30;
+ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg));
+
+ /* Delete should work only after close and unregister */
close(self->data_fd);
self->data_fd = open(data_file, O_RDWR);
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
-
- /* Unmap should work */
- ASSERT_EQ(0, munmap(status_page, page_size));
}
TEST_F(user, write_events) {
@@ -245,11 +261,12 @@ TEST_F(user, write_events) {
struct iovec io[3];
__u32 field1, field2;
int before = 0, after = 0;
- int page_size = sysconf(_SC_PAGESIZE);
- char *status_page;
reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)&self->check;
+ reg.enable_size = sizeof(self->check);
field1 = 1;
field2 = 2;
@@ -261,18 +278,10 @@ TEST_F(user, write_events) {
io[2].iov_base = &field2;
io[2].iov_len = sizeof(field2);
- status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
- self->status_fd, 0);
-
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
-
- /* MMAP should work and be zero'd */
- ASSERT_NE(MAP_FAILED, status_page);
- ASSERT_NE(NULL, status_page);
- ASSERT_EQ(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(0, self->check);
/* Write should fail on invalid slot with ENOENT */
io[0].iov_base = &field2;
@@ -287,13 +296,18 @@ TEST_F(user, write_events) {
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
/* Event should now be enabled */
- ASSERT_NE(0, status_check(status_page, reg.status_bit));
+	ASSERT_EQ(1 << reg.enable_bit, self->check);
/* Write should make it out to ftrace buffers */
before = trace_bytes();
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3));
after = trace_bytes();
ASSERT_GT(after, before);
+
+ /* Negative index should fail with EINVAL */
+ reg.write_index = -1;
+ ASSERT_EQ(-1, writev(self->data_fd, (const struct iovec *)io, 3));
+ ASSERT_EQ(EINVAL, errno);
}
TEST_F(user, write_fault) {
@@ -304,6 +318,9 @@ TEST_F(user, write_fault) {
reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event u64 anon";
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)&self->check;
+ reg.enable_size = sizeof(self->check);
anon = mmap(NULL, l, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(MAP_FAILED, anon);
@@ -316,7 +333,6 @@ TEST_F(user, write_fault) {
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
/* Write should work normally */
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2));
@@ -333,24 +349,17 @@ TEST_F(user, write_validator) {
int loc, bytes;
char data[8];
int before = 0, after = 0;
- int page_size = sysconf(_SC_PAGESIZE);
- char *status_page;
-
- status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
- self->status_fd, 0);
reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event __rel_loc char[] data";
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)&self->check;
+ reg.enable_size = sizeof(self->check);
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
-
- /* MMAP should work and be zero'd */
- ASSERT_NE(MAP_FAILED, status_page);
- ASSERT_NE(NULL, status_page);
- ASSERT_EQ(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(0, self->check);
io[0].iov_base = &reg.write_index;
io[0].iov_len = sizeof(reg.write_index);
@@ -369,7 +378,7 @@ TEST_F(user, write_validator) {
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
/* Event should now be enabled */
- ASSERT_NE(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(1 << reg.enable_bit, self->check);
/* Full in-bounds write should work */
before = trace_bytes();
@@ -409,71 +418,88 @@ TEST_F(user, print_fmt) {
int ret;
ret = check_print_fmt("__test_event __rel_loc char[] data",
- "print fmt: \"data=%s\", __get_rel_str(data)");
+ "print fmt: \"data=%s\", __get_rel_str(data)",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event __data_loc char[] data",
- "print fmt: \"data=%s\", __get_str(data)");
+ "print fmt: \"data=%s\", __get_str(data)",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event s64 data",
- "print fmt: \"data=%lld\", REC->data");
+ "print fmt: \"data=%lld\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event u64 data",
- "print fmt: \"data=%llu\", REC->data");
+ "print fmt: \"data=%llu\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event s32 data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event u32 data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event int data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event unsigned int data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event s16 data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event u16 data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event short data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event unsigned short data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event s8 data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event u8 data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event char data",
- "print fmt: \"data=%d\", REC->data");
+ "print fmt: \"data=%d\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event unsigned char data",
- "print fmt: \"data=%u\", REC->data");
+ "print fmt: \"data=%u\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
ret = check_print_fmt("__test_event char[4] data",
- "print fmt: \"data=%s\", REC->data");
+ "print fmt: \"data=%s\", REC->data",
+ &self->check);
ASSERT_EQ(0, ret);
}
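
The ftrace_test.c hunks above all apply the same conversion: the old read-only status page (status_fd mmap plus status_bit) is replaced by a user-supplied enable word that the kernel updates in place, registered through the enable_bit/enable_addr/enable_size fields of struct user_reg. A minimal sketch of that flow outside the kselftest harness is shown below; the event name "__sketch_event" and the "check" variable are placeholders, the fields and DIAG_IOCSREG are the uapi <linux/user_events.h> ABI this series lands, and error handling is elided:

/* Sketch only: register a user_event against a local enable word. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/user_events.h>

static int check;	/* kernel sets/clears bit 31 here on enable/disable */

int main(void)
{
	struct user_reg reg = {0};
	struct iovec io[2];
	__u32 value = 42;
	int data_fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

	reg.size = sizeof(reg);
	reg.name_args = (__u64)"__sketch_event u32 value";
	reg.enable_bit = 31;
	reg.enable_addr = (__u64)&check;
	reg.enable_size = sizeof(check);

	if (ioctl(data_fd, DIAG_IOCSREG, &reg) == -1)
		return 1;

	if (check & (1U << reg.enable_bit)) {
		/* Someone enabled the event: emit one record */
		io[0].iov_base = &reg.write_index;
		io[0].iov_len = sizeof(reg.write_index);
		io[1].iov_base = &value;
		io[1].iov_len = sizeof(value);
		writev(data_fd, io, 2);
	}

	close(data_fd);
	return 0;
}

As in the updated tests, the first iovec of every write must carry the write_index returned by DIAG_IOCSREG; the new negative-index check above confirms that anything else is rejected with EINVAL.
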
diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c
index 8b4c7879d5a7..a070258d4449 100644
--- a/tools/testing/selftests/user_events/perf_test.c
+++ b/tools/testing/selftests/user_events/perf_test.c
@@ -18,10 +18,9 @@
#include "../kselftest_harness.h"
-const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
-const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
-const char *id_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/id";
-const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+const char *id_file = "/sys/kernel/tracing/events/user_events/__test_event/id";
+const char *fmt_file = "/sys/kernel/tracing/events/user_events/__test_event/format";
struct event {
__u32 index;
@@ -35,11 +34,6 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid,
return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
}
-static inline int status_check(char *status_page, int status_bit)
-{
- return status_page[status_bit >> 3] & (1 << (status_bit & 7));
-}
-
static int get_id(void)
{
FILE *fp = fopen(id_file, "r");
@@ -88,45 +82,38 @@ static int get_offset(void)
}
FIXTURE(user) {
- int status_fd;
int data_fd;
+ int check;
};
FIXTURE_SETUP(user) {
- self->status_fd = open(status_file, O_RDONLY);
- ASSERT_NE(-1, self->status_fd);
-
self->data_fd = open(data_file, O_RDWR);
ASSERT_NE(-1, self->data_fd);
}
FIXTURE_TEARDOWN(user) {
- close(self->status_fd);
close(self->data_fd);
}
TEST_F(user, perf_write) {
struct perf_event_attr pe = {0};
struct user_reg reg = {0};
- int page_size = sysconf(_SC_PAGESIZE);
- char *status_page;
struct event event;
struct perf_event_mmap_page *perf_page;
+ int page_size = sysconf(_SC_PAGESIZE);
int id, fd, offset;
__u32 *val;
reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
-
- status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
- self->status_fd, 0);
- ASSERT_NE(MAP_FAILED, status_page);
+ reg.enable_bit = 31;
+ reg.enable_addr = (__u64)&self->check;
+ reg.enable_size = sizeof(self->check);
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
- ASSERT_NE(0, reg.status_bit);
- ASSERT_EQ(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(0, self->check);
/* Id should be there */
id = get_id();
@@ -149,7 +136,7 @@ TEST_F(user, perf_write) {
ASSERT_NE(MAP_FAILED, perf_page);
/* Status should be updated */
- ASSERT_NE(0, status_check(status_page, reg.status_bit));
+ ASSERT_EQ(1 << reg.enable_bit, self->check);
event.index = reg.write_index;
event.field1 = 0xc001;
@@ -165,6 +152,12 @@ TEST_F(user, perf_write) {
/* Ensure correct */
ASSERT_EQ(event.field1, *val++);
ASSERT_EQ(event.field2, *val++);
+
+ munmap(perf_page, page_size * 2);
+ close(fd);
+
+ /* Status should be updated */
+ ASSERT_EQ(0, self->check);
}
int main(int argc, char **argv)
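
The perf_test.c hunk exercises the other side of the same ABI: opening a perf event on the user_event sets the registered enable bit, and dropping the last reference (munmap plus close) clears it again. A rough, hedged sketch of that lifecycle follows; it assumes the event was already registered with enable_addr pointing at *check (as in the fixture above) and that id was read from the event's tracefs id file, and it omits error handling:

/* Sketch only: attach/detach perf and watch the enable word flip. */
#include <assert.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *pe, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
}

static void attach_detach(int id, volatile int *check)
{
	struct perf_event_attr pe = {0};
	long page_size = sysconf(_SC_PAGESIZE);
	void *perf_page;
	int fd;

	pe.type = PERF_TYPE_TRACEPOINT;
	pe.size = sizeof(pe);
	pe.config = id;			/* tracefs id of the user_event */
	pe.sample_type = PERF_SAMPLE_RAW;

	fd = perf_event_open(&pe, 0 /* this pid */, -1, -1, 0);
	perf_page = mmap(NULL, page_size * 2, PROT_READ, MAP_SHARED, fd, 0);

	/* While perf holds the event, the kernel keeps enable_bit set */
	assert(*check != 0);

	munmap(perf_page, page_size * 2);
	close(fd);

	/* Last reference gone: the kernel clears the enable word again */
	assert(*check == 0);
}

This mirrors the new assertions at the end of perf_write: enablement is now observed entirely through the process-local word handed to DIAG_IOCSREG, with no shared status page to mmap.
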