summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/audit_fsnotify.c22
-rw-r--r--kernel/audit_tree.c10
-rw-r--r--kernel/audit_watch.c19
-rw-r--r--kernel/crash_core.c50
-rw-r--r--kernel/events/callchain.c5
-rw-r--r--kernel/events/core.c5
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/exit.c18
-rw-r--r--kernel/fork.c56
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/kcov.c6
-rw-r--r--kernel/kexec_file.c18
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kprobes.c24
-rw-r--r--kernel/kthread.c13
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/locktorture.c10
-rw-r--r--kernel/locking/qspinlock.c7
-rw-r--r--kernel/module.c82
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/power/hibernate.c97
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/rcu/rcuperf.c8
-rw-r--r--kernel/rcu/rcutorture.c7
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/regset.c76
-rw-r--r--kernel/sched/core.c50
-rw-r--r--kernel/sched/psi.c5
-rw-r--r--kernel/scs.c2
-rw-r--r--kernel/stacktrace.c5
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/time/hrtimer.c13
-rw-r--r--kernel/time/timekeeping.c19
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c34
-rw-r--r--kernel/trace/ring_buffer.c694
-rw-r--r--kernel/trace/ring_buffer_benchmark.c48
-rw-r--r--kernel/trace/trace.c83
-rw-r--r--kernel/trace/trace.h9
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_hwlat.c6
-rw-r--r--kernel/trace/trace_output.c14
-rw-r--r--kernel/trace/trace_uprobe.c1
-rw-r--r--kernel/umh.c29
46 files changed, 1184 insertions, 415 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bdeb77e27042..b3da548691c9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o regset.o
obj-$(CONFIG_BPFILTER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
@@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 3596448bfdab..bfcfcd61adb6 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group;
/* fsnotify events we care about. */
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF)
static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
{
@@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
}
/* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct audit_fsnotify_mark *audit_mark;
- const struct inode *inode = fsnotify_data_inode(data, data_type);
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- BUG_ON(group != audit_fsnotify_group);
-
- if (WARN_ON(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
+ WARN_ON_ONCE(!inode))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
return 0;
audit_update_mark(audit_mark, inode);
- } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
audit_autoremove_mark_rule(audit_mark);
+ }
return 0;
}
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
- .handle_event = audit_mark_handle_event,
+ .handle_inode_event = audit_mark_handle_event,
.free_mark = audit_fsnotify_free_mark,
};
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1b7a2f041793..83e1c07fc99e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1035,11 +1035,9 @@ static void evict_chunk(struct audit_chunk *chunk)
audit_schedule_prune();
}
-static int audit_tree_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *file_name)
{
return 0;
}
@@ -1068,7 +1066,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
}
static const struct fsnotify_ops audit_tree_ops = {
- .handle_event = audit_tree_handle_event,
+ .handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
.free_mark = audit_tree_destroy_watch,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e09c551ae52d..246e5ba704c0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
+ FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule)
}
/* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- const struct inode *inode = fsnotify_data_inode(data, data_type);
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
- BUG_ON(group != audit_watch_group);
- WARN_ON(!inode);
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
+ WARN_ON_ONCE(!inode))
+ return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .handle_event = audit_watch_handle_event,
+ .handle_inode_event = audit_watch_handle_event,
.free_mark = audit_watch_free_mark,
};
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 18175687133a..106e4500fd53 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
#include <asm/page.h>
#include <asm/sections.h>
+#include <crypto/sha.h>
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+#define NOTES_SIZE (&__stop_notes - &__start_notes)
+#define BUILD_ID_MAX SHA1_DIGEST_SIZE
+#define NT_GNU_BUILD_ID 3
+
+struct elf_note_section {
+ struct elf_note n_hdr;
+ u8 n_data[];
+};
+
+/*
+ * Add build ID from .notes section as generated by the GNU ld(1)
+ * or LLVM lld(1) --build-id option.
+ */
+static void add_build_id_vmcoreinfo(void)
+{
+ char build_id[BUILD_ID_MAX * 2 + 1];
+ int n_remain = NOTES_SIZE;
+
+ while (n_remain >= sizeof(struct elf_note)) {
+ const struct elf_note_section *note_sec =
+ &__start_notes + NOTES_SIZE - n_remain;
+ const u32 n_namesz = note_sec->n_hdr.n_namesz;
+
+ if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
+ n_namesz != 0 &&
+ !strcmp((char *)&note_sec->n_data[0], "GNU")) {
+ if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
+ const u32 n_descsz = note_sec->n_hdr.n_descsz;
+ const u8 *s = &note_sec->n_data[n_namesz];
+
+ s = PTR_ALIGN(s, 4);
+ bin2hex(build_id, s, n_descsz);
+ build_id[2 * n_descsz] = '\0';
+ VMCOREINFO_BUILD_ID(build_id);
+ return;
+ }
+ pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
+ note_sec->n_hdr.n_descsz,
+ BUILD_ID_MAX);
+ return;
+ }
+ n_remain -= sizeof(struct elf_note) +
+ ALIGN(note_sec->n_hdr.n_namesz, 4) +
+ ALIGN(note_sec->n_hdr.n_descsz, 4);
+ }
+}
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ add_build_id_vmcoreinfo();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c6ce894e4ce9..58cbe357fb2b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- set_fs(fs);
+ force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1f0a7e5b182..6961333ebad5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
+ force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 25de10c904e6..649fd53dc9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
if (!vaddr || !d)
return -EINVAL;
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, &vma, NULL);
if (unlikely(ret <= 0)) {
/*
@@ -477,7 +477,7 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index e731c414e024..733e80f334e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,7 +732,7 @@ void __noreturn do_exit(long code)
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
- set_fs(USER_DS);
+ force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 76d3f3387554..4d32190861bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREAD_SIZE_ORDER);
if (likely(page)) {
- tsk->stack = page_address(page);
+ tsk->stack = kasan_reset_tag(page_address(page));
return tsk->stack;
}
return NULL;
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
if (vm) {
int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- -(int)(PAGE_SIZE / 1024));
-
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
- }
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
return stack;
}
@@ -382,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
- } else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
-
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
-
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
- }
+ /* All stack pages are in the same node. */
+ if (vm)
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ else
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -415,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
struct vm_struct *vm = task_stack_vm_area(tsk);
int ret;
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
if (vm) {
int i;
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
* If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and both memcg_kmem_uncharge_page()
- * and mod_memcg_page_state() in free_thread_stack()
- * will ignore this page. So it's safe.
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
+ * free_thread_stack() will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
if (ret)
return ret;
-
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- PAGE_SIZE / 1024);
}
}
#endif
@@ -2033,7 +2011,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
diff --git a/kernel/futex.c b/kernel/futex.c
index 83404124b77b..61e8153e6c76 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
mmap_read_lock(mm);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
mmap_read_unlock(mm);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 48c38e09c673..d55ba625d426 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1308,9 +1308,6 @@ static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
struct task_struct *t;
- struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
if (!secondary) {
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -1318,13 +1315,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
} else {
t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
new->name);
- param.sched_priority -= 1;
}
if (IS_ERR(t))
return PTR_ERR(t);
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+ sched_set_fifo(t);
/*
* We keep the reference to the task struct even if
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..6b8368be89c8 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -96,7 +96,7 @@ struct kcov_percpu_data {
int saved_sequence;
};
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
}
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
}
}
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78df53c6..78c0837bfd7b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -265,7 +265,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
goto out;
}
- ima_kexec_cmdline(image->cmdline_buf,
+ ima_kexec_cmdline(kernel_fd, image->cmdline_buf,
image->cmdline_buf_len - 1);
}
@@ -636,6 +636,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ return kexec_locate_mem_hole(kbuf);
+}
+
+/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
@@ -647,7 +660,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
*/
int kexec_add_buffer(struct kexec_buf *kbuf)
{
-
struct kexec_segment *ksegment;
int ret;
@@ -675,7 +687,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = kexec_locate_mem_hole(kbuf);
+ ret = arch_kexec_locate_mem_hole(kbuf);
if (ret)
return ret;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 37c3c4b97b8e..3cd075ce2a1e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,9 +36,8 @@
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e87679a48ba2..287b263c9cb9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1111,9 +1111,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+ return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
/* Arm a kprobe with text_mutex */
@@ -2145,6 +2156,13 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace.
+ */
+ if (kprobe_ftrace(p))
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1d9e2fdfd67a..3edaa380dc7b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind);
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
@@ -1241,13 +1240,16 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(tsk->mm);
task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
@@ -1256,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
if (active_mm != mm)
mmdrop(active_mm);
- to_kthread(tsk)->oldfs = get_fs();
- set_fs(USER_DS);
+ to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1272,13 +1273,15 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- set_fs(to_kthread(tsk)->oldfs);
+ force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
sync_mm_rss(mm);
+ local_irq_disable();
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..02ef87f50df2 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ff6f50e06a0..9cfa5e89cff7 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
static void torture_rtmutex_boost(struct torture_random_state *trsp)
{
- int policy;
- struct sched_param param;
const unsigned int factor = 50000; /* yes, quite arbitrary */
if (!rt_task(current)) {
@@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (trsp && !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor))) {
- policy = SCHED_FIFO;
- param.sched_priority = MAX_RT_PRIO - 1;
+ sched_set_fifo(current);
} else /* common case, do nothing */
return;
} else {
@@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (!trsp || !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor * 2))) {
- policy = SCHED_NORMAL;
- param.sched_priority = 0;
+ sched_set_normal(current, 0);
} else /* common case, do nothing */
return;
}
-
- sched_setscheduler_nocheck(current, policy, &param);
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..cbff6ba53d56 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#include "qspinlock_paravirt.h"
#include "qspinlock.c"
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+ nopvspin = true;
+ return 0;
+}
+early_param("nopvspin", parse_nopvspin);
#endif
diff --git a/kernel/module.c b/kernel/module.c
index e7b4ff7e4fd0..1c5cff34d9f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
struct module *owner,
void *data),
void *data)
@@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
}
return false;
}
-EXPORT_SYMBOL_GPL(each_symbol_section);
struct find_symbol_arg {
/* Input */
@@ -496,6 +495,7 @@ struct find_symbol_arg {
struct module *owner;
const s32 *crc;
const struct kernel_symbol *sym;
+ enum mod_license license;
};
static bool check_exported_symbol(const struct symsearch *syms,
@@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms,
struct find_symbol_arg *fsa = data;
if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
+ if (syms->license == GPL_ONLY)
return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
pr_warn("Symbol %s is being used by a non-GPL module, "
"which will not be allowed in the future\n",
fsa->name);
@@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
+ fsa->license = syms->license;
return true;
}
@@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
+static const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const s32 **crc,
+ enum mod_license *license,
bool gplok,
bool warn)
{
@@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
+ if (license)
+ *license = fsa.license;
return fsa.sym;
}
pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
-EXPORT_SYMBOL_GPL(find_symbol);
/*
* Search for module by name: must hold module_mutex (or preempt disabled
@@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b)
}
/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
int err;
@@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b)
}
return 0;
}
-EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
+ if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod)
{
}
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(ref_module);
static inline int module_unload_init(struct module *mod)
{
@@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info,
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, true, false)) {
+ if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
preempt_enable();
BUG();
}
@@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+ mod->name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+ mod->name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
@@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
struct module *owner;
const struct kernel_symbol *sym;
const s32 *crc;
+ enum mod_license license;
int err;
/*
@@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
+ sym = find_symbol(name, &owner, &crc, &license,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!sym)
goto unlock;
+ if (license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, owner)) {
+ sym = NULL;
+ goto getname;
+ }
+
if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
@@ -1520,18 +1548,34 @@ struct module_sect_attrs {
struct module_sect_attr attrs[];
};
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
struct bin_attribute *battr,
char *buf, loff_t pos, size_t count)
{
struct module_sect_attr *sattr =
container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
if (pos != 0)
return -EINVAL;
- return sprintf(buf, "0x%px\n",
- kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL);
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1580,7 +1624,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
goto out;
sect_attrs->nsections++;
sattr->battr.read = module_sect_read;
- sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4);
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
sattr->battr.attr.mode = 0400;
*(gattr++) = &(sattr++)->battr;
}
@@ -2220,7 +2264,7 @@ void *__symbol_get(const char *symbol)
const struct kernel_symbol *sym;
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
+ sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
@@ -2256,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- true, false)) {
+ NULL, true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
@@ -4473,7 +4517,6 @@ struct module *__module_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_address);
/*
* is_module_text_address - is this address inside module code?
@@ -4512,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..aef8872ba843 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -505,7 +505,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -551,7 +551,7 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5714f51ba9f8..f33769f97aca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -795,6 +795,103 @@ int hibernate(void)
return error;
}
+/**
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
+ *
+ * Return the @func return value or an error code if it cannot be executed.
+ */
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
+{
+ int error, nr_calls = 0;
+
+ lock_system_sleep();
+
+ if (!hibernate_acquire()) {
+ error = -EBUSY;
+ goto unlock;
+ }
+
+ pm_prepare_console();
+
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
+ goto exit;
+ }
+
+ error = freeze_processes();
+ if (error)
+ goto exit;
+
+ lock_device_hotplug();
+
+ pm_suspend_clear_flags();
+
+ error = platform_begin(true);
+ if (error)
+ goto thaw;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto thaw;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto dpm_complete;
+
+ suspend_console();
+
+ error = dpm_suspend(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = platform_pre_snapshot(true);
+ if (error)
+ goto skip;
+
+ error = func(data);
+
+skip:
+ platform_finish(true);
+
+ dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+ dpm_resume(PMSG_THAW);
+
+ resume_console();
+
+dpm_complete:
+ dpm_complete(PMSG_THAW);
+
+ thaw_kernel_threads();
+
+thaw:
+ platform_end(true);
+
+ unlock_device_hotplug();
+
+ thaw_processes();
+
+exit:
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+
+ pm_restore_console();
+
+ hibernate_release();
+
+unlock:
+ unlock_system_sleep();
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
/**
* software_resume - Resume from a saved hibernation image.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cef154261fe2..d25749bce7cf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index ec903d781778..21448d3374e2 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -361,7 +361,6 @@ rcu_perf_writer(void *arg)
int i_max;
long me = (long)arg;
struct rcu_head *rhp = NULL;
- struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
u64 *wdp;
@@ -370,8 +369,7 @@ rcu_perf_writer(void *arg)
VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- sp.sched_priority = 1;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ sched_set_fifo_low(current);
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
@@ -427,9 +425,7 @@ retry:
started = true;
if (!done && i >= MIN_MEAS) {
done = true;
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current,
- SCHED_NORMAL, &sp);
+ sched_set_normal(current, 0);
pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
perf_type, PERF_FLAG, me, MIN_MEAS);
if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d0d265304d14..f453bf8d2f1e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -895,16 +895,11 @@ static int rcu_torture_boost(void *arg)
unsigned long endtime;
unsigned long oldstarttime;
struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
+ sched_set_fifo_low(current);
init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ac7198ed3197..8ce77d9ac716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -59,6 +59,7 @@
#include <linux/sched/clock.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
+#include <linux/kasan.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
+ kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
diff --git a/kernel/regset.c b/kernel/regset.c
new file mode 100644
index 000000000000..586823786f39
--- /dev/null
+++ b/kernel/regset.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/regset.h>
+
+static int __regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ void *p = *data, *to_free = NULL;
+ int res;
+
+ if (!regset->regset_get)
+ return -EOPNOTSUPP;
+ if (size > regset->n * regset->size)
+ size = regset->n * regset->size;
+ if (!p) {
+ to_free = p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ res = regset->regset_get(target, regset,
+ (struct membuf){.p = p, .left = size});
+ if (res < 0) {
+ kfree(to_free);
+ return res;
+ }
+ *data = p;
+ return size - res;
+}
+
+int regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void *data)
+{
+ return __regset_get(target, regset, size, &data);
+}
+EXPORT_SYMBOL(regset_get);
+
+int regset_get_alloc(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ *data = NULL;
+ return __regset_get(target, regset, size, data);
+}
+EXPORT_SYMBOL(regset_get_alloc);
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target: thread to be examined
+ * @view: &struct user_regset_view describing user thread machine state
+ * @setno: index in @view->regsets
+ * @offset: offset into the regset data, in bytes
+ * @size: amount of data to copy, in bytes
+ * @data: user-mode pointer to copy into
+ */
+int copy_regset_to_user(struct task_struct *target,
+ const struct user_regset_view *view,
+ unsigned int setno,
+ unsigned int offset, unsigned int size,
+ void __user *data)
+{
+ const struct user_regset *regset = &view->regsets[setno];
+ void *buf;
+ int ret;
+
+ ret = regset_get_alloc(target, regset, size, &buf);
+ if (ret > 0)
+ ret = copy_to_user(data, buf, ret) ? -EFAULT : 0;
+ kfree(buf);
+ return ret;
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a0e7b449b88..84758f34cdb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5496,6 +5496,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Use sched_set_fifo(), read its comment.
+ *
* Return: 0 on success. An error code otherwise.
*
* NOTE that the task may be already dead.
@@ -5505,13 +5507,11 @@ int sched_setscheduler(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, true);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr);
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
@@ -5536,7 +5536,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e53b711bd643..967732c0766c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -616,11 +616,8 @@ out:
static int psi_poll_worker(void *data)
{
struct psi_group *group = (struct psi_group *)data;
- struct sched_param param = {
- .sched_priority = 1,
- };
- sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+ sched_set_fifo_low(current);
while (true) {
wait_event_interruptible(group->poll_wait,
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..4ff4a7ba0094 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
{
struct page *scs_page = virt_to_page(s);
- mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..946f44a9e86a 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
if (current->flags & PF_KTHREAD)
return 0;
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- set_fs(fs);
+ force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b4d2dc270a5..287862f91717 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = overcommit_policy_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
@@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = sysctl_compaction_handler,
},
{
+ .procname = "compaction_proactiveness",
+ .data = &sysctl_compaction_proactiveness,
+ .maxlen = sizeof(sysctl_compaction_proactiveness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_hundred,
+ },
+ {
.procname = "extfrag_threshold",
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d89da1c7e005..c4038511d5c9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ .clock_base = { {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ }, },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu)
int i;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- cpu_base->clock_base[i].cpu_base = cpu_base;
- timerqueue_init_head(&cpu_base->clock_base[i].active);
+ struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+ clock_b->cpu_base = cpu_base;
+ seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+ timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 63a632f9896c..406306b33452 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -39,18 +39,19 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
+static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_ZERO(tk_core.seq),
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
/**
@@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper;
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct tk_read_base base[2];
};
@@ -80,11 +81,13 @@ static struct clocksource dummy_clock = {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
@@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* tk_clock_read - atomic clocksource read() helper
*
* This helper is necessary to use in the read paths because, while the
- * seqlock ensures we don't return a bad value while structures are updated,
+ * seqcount ensures we don't return a bad value while structures are updated,
* it doesn't protect from potential crashes. There is the possibility that
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
@@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
unsigned int seq;
/*
- * Since we're called holding a seqlock, the data may shift
+ * Since we're called holding a seqcount, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
- * results away. So nest another seqlock here to atomically
+ * results away. So nest another seqcount here to atomically
* grab the points we are checking with.
*/
do {
@@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
*
* To keep it NMI safe since we're accessing from tracing, we're not using a
* separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
+ * protected with seqcounts. This has the following minor side effects:
*
* (1) Its possible that a timestamp be taken after the boot offset is updated
* but before the timekeeper is updated. If this happens, the new boot offset
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index aeba5ee7325a..e153be351548 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -2,9 +2,9 @@
# Do not instrument the tracer itself:
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
# Avoid recursion due to instrumentation.
KCSAN_SANITIZE := n
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72064541bef2..275441254bb5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#endif
}
-#define FTRACE_PID_IGNORE -1
-#define FTRACE_PID_TRACE -2
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -2388,6 +2385,14 @@ struct ftrace_ops direct_ops = {
.flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
+ /*
+ * By declaring the main trampoline as this trampoline
+ * it will never have one allocated for it. Allocated
+ * trampolines should not call direct functions.
+ * The direct_ops should only be called by the builtin
+ * ftrace_regs_caller trampoline.
+ */
+ .trampoline = FTRACE_REGS_ADDR,
};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
@@ -6255,8 +6260,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ continue;
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ if (cnt == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
+ }
}
return cnt;
@@ -6435,8 +6451,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -7066,12 +7082,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f15471ce969e..93ef0ab6ea20 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
+#define for_each_online_buffer_cpu(buffer, cpu) \
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
@@ -413,12 +416,27 @@ struct rb_irq_work {
struct rb_event_info {
u64 ts;
u64 delta;
+ u64 before;
+ u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
/*
+ * Used for the add_timestamp
+ * NONE
+ * EXTEND - wants a time extend
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
+ * FORCE - force a full time stamp.
+ */
+enum {
+ RB_ADD_STAMP_NONE = 0,
+ RB_ADD_STAMP_EXTEND = BIT(1),
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
+ RB_ADD_STAMP_FORCE = BIT(3)
+};
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -435,6 +453,28 @@ enum {
RB_CTX_MAX
};
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+ local_t cnt;
+ local_t top;
+ local_t bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+ local64_t time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -470,7 +510,8 @@ struct ring_buffer_per_cpu {
size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
- u64 write_stamp;
+ rb_time_t write_stamp;
+ rb_time_t before_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -513,6 +554,189 @@ struct ring_buffer_iter {
int missed_events;
};
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 still uses local64_t).
+ *
+ * For the ring buffer, 64 bit required operations for the time is
+ * the following:
+ *
+ * - Only need 59 bits (uses 60 to make it even).
+ * - Reads may fail if it interrupted a modification of the time stamp.
+ * It will succeed if it did not interrupt another write even if
+ * the read itself is interrupted by a write.
+ * It returns whether it was successful or not.
+ *
+ * - Writes always succeed and will overwrite other writes and writes
+ * that were done by events interrupting the current write.
+ *
+ * - A write followed by a read of the same time stamp will always succeed,
+ * but may not contain the same value.
+ *
+ * - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ * Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+ * (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ * top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT 30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+ return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+ u64 val;
+
+ val = top & RB_TIME_VAL_MASK;
+ val <<= RB_TIME_SHIFT;
+ val |= bottom & RB_TIME_VAL_MASK;
+
+ return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+ unsigned long top, bottom;
+ unsigned long c;
+
+ /*
+ * If the read is interrupted by a write, then the cnt will
+ * be different. Loop until both top and bottom have been read
+ * without interruption.
+ */
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(bottom))
+ return false;
+
+ *ret = rb_time_val(top, bottom);
+ return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ unsigned long cnt;
+
+ return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+ return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+ *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+ *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+ val = rb_time_val_cnt(val, cnt);
+ local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ unsigned long cnt, top, bottom;
+
+ rb_time_split(val, &top, &bottom);
+
+ /* Writes always succeed with a valid number even if it gets interrupted. */
+ do {
+ cnt = local_inc_return(&t->cnt);
+ rb_time_val_set(&t->top, top, cnt);
+ rb_time_val_set(&t->bottom, bottom, cnt);
+ } while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+ unsigned long ret;
+
+ ret = local_cmpxchg(l, expect, set);
+ return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ unsigned long cnt, top, bottom;
+ unsigned long cnt2, top2, bottom2;
+ u64 val;
+
+ /* The cmpxchg always fails if it interrupted an update */
+ if (!__rb_time_read(t, &val, &cnt2))
+ return false;
+
+ if (val != expect)
+ return false;
+
+ cnt = local_read(&t->cnt);
+ if ((cnt & 3) != cnt2)
+ return false;
+
+ cnt2 = cnt + 1;
+
+ rb_time_split(val, &top, &bottom);
+ top = rb_time_val_cnt(top, cnt);
+ bottom = rb_time_val_cnt(bottom, cnt);
+
+ rb_time_split(set, &top2, &bottom2);
+ top2 = rb_time_val_cnt(top2, cnt2);
+ bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+ if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+ return false;
+ return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ *ret = local64_read(&t->time);
+ return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ u64 val;
+ val = local64_cmpxchg(&t->time, expect, set);
+ return val == expect;
+}
+#endif
+
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
@@ -746,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
+ u64 ts;
+
+ /* Skip retpolines :-( */
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ ts = trace_clock_local();
+ else
+ ts = buffer->clock();
+
/* shift to debug/test normalization and TIME_EXTENTS */
- return buffer->clock() << DEBUG_SHIFT;
+ return ts << DEBUG_SHIFT;
}
u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@ -2372,8 +2604,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
@@ -2397,6 +2629,66 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event);
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ u64 write_stamp;
+
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)info->before,
+ (unsigned long long)info->after,
+ (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event **event,
+ struct rb_event_info *info,
+ u64 *delta,
+ unsigned int *length)
+{
+ bool abs = info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+ if (unlikely(info->delta > (1ULL << 59))) {
+ /* did the clock go backwards */
+ if (info->before == info->after && info->before > info->ts) {
+ /* not interrupted */
+ static int once;
+
+ /*
+ * This is possible with a recalibrating of the TSC.
+ * Do not produce a call stack, but just report it.
+ */
+ if (!once) {
+ once++;
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+ info->before, info->ts);
+ }
+ } else
+ rb_check_timestamp(cpu_buffer, info);
+ if (!abs)
+ info->delta = 0;
+ }
+ *event = rb_add_time_stamp(*event, info->delta, abs);
+ *length -= RB_LEN_TIME_EXTEND;
+ *delta = 0;
+}
+
/**
* rb_update_event - update event type and data
* @cpu_buffer: The per cpu buffer of the @event
@@ -2416,21 +2708,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
unsigned length = info->length;
u64 delta = info->delta;
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
- if (unlikely(info->add_timestamp)) {
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
- event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
+ if (unlikely(info->add_timestamp))
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
@@ -2473,12 +2756,38 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
+static __always_inline bool
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
- return true;
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static u64 rb_time_delta(struct ring_buffer_event *event)
+{
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return 0;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return ring_buffer_event_time_stamp(event);
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return 0;
+
+ case RINGBUF_TYPE_DATA:
+ return event->time_delta;
+ default:
+ return 0;
+ }
}
-#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2488,6 +2797,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long index;
unsigned long addr;
+ u64 write_stamp;
+ u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -2496,10 +2807,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
+ delta = rb_time_delta(event);
+
+ if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+ return 0;
+
+ /* Make sure the write stamp is read before testing the location */
+ barrier();
+
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
+
+ /* Something came in, can't discard */
+ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ write_stamp, write_stamp - delta))
+ return 0;
+
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+ * write stamp for us, and use the before stamp as its delta.
+ */
+
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2551,10 +2884,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- /* Only update the write stamp if the page has an event */
- if (rb_page_write(cpu_buffer->commit_page))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2626,54 +2955,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp += delta;
- } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
- delta = ring_buffer_event_time_stamp(event);
- cpu_buffer->write_stamp = delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
rb_end_commit(cpu_buffer);
}
@@ -2864,58 +3149,138 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct rb_event_info *info)
-{
- WARN_ONCE(info->delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)info->delta,
- (unsigned long long)info->ts,
- (unsigned long long)cpu_buffer->write_stamp,
- sched_clock_stable() ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
- "or add trace_clock=global to the kernel command line\n");
- info->add_timestamp = 1;
-}
-
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
{
struct ring_buffer_event *event;
struct buffer_page *tail_page;
- unsigned long tail, write;
-
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(info->add_timestamp))
- info->length += RB_LEN_TIME_EXTEND;
+ unsigned long tail, write, w;
+ bool a_ok;
+ bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
- write = local_add_return(info->length, &tail_page->write);
+
+ /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
+ barrier();
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ info->ts = rb_time_stamp(cpu_buffer->buffer);
+
+ if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+ info->delta = info->ts;
+ } else {
+ /*
+ * If interrupting an event time update, we may need an
+ * absolute timestamp.
+ * Don't bother if this is the start of a new page (w == 0).
+ */
+ if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ } else {
+ info->delta = info->ts - info->after;
+ if (unlikely(test_time_stamp(info->delta))) {
+ info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+ info->length += RB_LEN_TIME_EXTEND;
+ }
+ }
+ }
+
+ /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
+
+ /*C*/ write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
write &= RB_WRITE_MASK;
+
tail = write - info->length;
+ /* See if we shot pass the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (tail != w) {
+ /* before and after may now different, fix it up*/
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ if (a_ok && b_ok && info->before != info->after)
+ (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+ info->before, info->after);
+ }
+ return rb_move_tail(cpu_buffer, tail, info);
+ }
+
+ if (likely(tail == w)) {
+ u64 save_before;
+ bool s_ok;
+
+ /* Nothing interrupted us between A and C */
+ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
+ barrier();
+ /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+ RB_WARN_ON(cpu_buffer, !s_ok);
+ if (likely(!(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+ /* This did not interrupt any time update */
+ info->delta = info->ts - info->after;
+ else
+ /* Just use full timestamp for inerrupting event */
+ info->delta = info->ts;
+ barrier();
+ if (unlikely(info->ts != save_before)) {
+ /* SLOW PATH - Interrupted between C and E */
+
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /* Write stamp must only go forward */
+ if (save_before > info->after) {
+ /*
+ * We do not care about the result, only that
+ * it gets updated atomically.
+ */
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, save_before);
+ }
+ }
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ barrier();
+ /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after < ts) {
+ /* Nothing came after this event between C and E */
+ info->delta = ts - info->after;
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, info->ts);
+ info->ts = ts;
+ } else {
+ /*
+ * Interrupted beween C and E:
+ * Lost the previous events time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
+ * came after this will still be correct (as they would
+ * have built their delta on the previous event.
+ */
+ info->delta = 0;
+ }
+ info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+ }
+
/*
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+ if (unlikely(!tail && !(info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
info->delta = 0;
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, tail, info);
-
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
@@ -2927,7 +3292,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then update
* its timestamp.
*/
- if (!tail)
+ if (unlikely(!tail))
tail_page->page->time_stamp = info->ts;
/* account for these added bytes */
@@ -2944,9 +3309,10 @@ rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_event *event;
struct rb_event_info info;
int nr_loops = 0;
- u64 diff;
+ int add_ts_default;
rb_start_commit(cpu_buffer);
+ /* The commit page can not change after this */
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/*
@@ -2964,8 +3330,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
#endif
info.length = rb_calculate_event_length(length);
+
+ if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+ add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+ info.length += RB_LEN_TIME_EXTEND;
+ } else {
+ add_ts_default = RB_ADD_STAMP_NONE;
+ }
+
again:
- info.add_timestamp = 0;
+ info.add_timestamp = add_ts_default;
info.delta = 0;
/*
@@ -2980,35 +3354,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- info.ts = rb_time_stamp(cpu_buffer->buffer);
- diff = info.ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- if (ring_buffer_time_stamp_abs(buffer)) {
- info.delta = info.ts;
- rb_handle_timestamp(cpu_buffer, &info);
- } else /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
- info.delta = diff;
- if (unlikely(test_time_stamp(info.delta)))
- rb_handle_timestamp(cpu_buffer, &info);
- }
-
event = __rb_reserve_next(cpu_buffer, &info);
if (unlikely(PTR_ERR(event) == -EAGAIN)) {
- if (info.add_timestamp)
+ if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
info.length -= RB_LEN_TIME_EXTEND;
goto again;
}
- if (!event)
- goto out_fail;
-
- return event;
-
+ if (likely(event))
+ return event;
out_fail:
rb_end_commit(cpu_buffer);
return NULL;
@@ -3154,11 +3509,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
if (rb_try_to_discard(cpu_buffer, event))
goto out;
- /*
- * The commit is still visible by the reader, so we
- * must still update the timestamp.
- */
- rb_update_write_stamp(cpu_buffer, event);
out:
rb_end_commit(cpu_buffer);
@@ -4475,8 +4825,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
- cpu_buffer->write_stamp = 0;
- cpu_buffer->read_stamp = 0;
+ rb_time_set(&cpu_buffer->write_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
@@ -4484,6 +4834,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_head_page_activate(cpu_buffer);
}
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
/**
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
@@ -4492,7 +4862,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -4503,24 +4872,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
/* Make sure all commits have finished */
synchronize_rcu();
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reset_disabled_cpu_buffer(cpu_buffer);
- if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
- arch_spin_lock(&cpu_buffer->lock);
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
- rb_reset_cpu(cpu_buffer);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
- arch_spin_unlock(&cpu_buffer->lock);
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ /* Make sure all commits have finished */
+ synchronize_rcu();
- atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ for_each_online_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
}
-EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset - reset a ring buffer
@@ -4528,10 +4915,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
*/
void ring_buffer_reset(struct trace_buffer *buffer)
{
+ struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
- for_each_buffer_cpu(buffer, cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+ }
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ reset_disabled_cpu_buffer(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8df0aa810950..78e576575b79 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
static int producer_nice = MAX_NICE;
static int consumer_nice = MAX_NICE;
-static int producer_fifo = -1;
-static int consumer_fifo = -1;
+static int producer_fifo;
+static int consumer_fifo;
module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
@@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
module_param(producer_fifo, int, 0644);
-MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo");
module_param(consumer_fifo, int, 0644);
-MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo");
static int read_events;
@@ -303,22 +303,22 @@ static void ring_buffer_producer(void)
trace_printk("ERROR!\n");
if (!disable_reader) {
- if (consumer_fifo < 0)
+ if (consumer_fifo)
+ trace_printk("Running Consumer at SCHED_FIFO %s\n",
+ consumer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
- else
- trace_printk("Running Consumer at SCHED_FIFO %d\n",
- consumer_fifo);
}
- if (producer_fifo < 0)
+ if (producer_fifo)
+ trace_printk("Running Producer at SCHED_FIFO %s\n",
+ producer_fifo == 1 ? "low" : "high");
+ else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
- else
- trace_printk("Running Producer at SCHED_FIFO %d\n",
- producer_fifo);
/* Let the user know that the test is running at low priority */
- if (producer_fifo < 0 && consumer_fifo < 0 &&
+ if (!producer_fifo && !consumer_fifo &&
producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
@@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void)
* Run them as low-prio background tasks by default:
*/
if (!disable_reader) {
- if (consumer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = consumer_fifo
- };
- sched_setscheduler(consumer, SCHED_FIFO, &param);
- } else
+ if (consumer_fifo >= 2)
+ sched_set_fifo(consumer);
+ else if (consumer_fifo == 1)
+ sched_set_fifo_low(consumer);
+ else
set_user_nice(consumer, consumer_nice);
}
- if (producer_fifo >= 0) {
- struct sched_param param = {
- .sched_priority = producer_fifo
- };
- sched_setscheduler(producer, SCHED_FIFO, &param);
- } else
+ if (producer_fifo >= 2)
+ sched_set_fifo(producer);
+ else if (producer_fifo == 1)
+ sched_set_fifo_low(producer);
+ else
set_user_nice(producer, producer_nice);
return 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 848f67a5f16d..f40d850ebabc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work)
{
struct trace_array *tr = container_of(work, struct trace_array,
fsnotify_work);
- fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
- tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+ fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@ -2003,7 +2002,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
void tracing_reset_online_cpus(struct array_buffer *buf)
{
struct trace_buffer *buffer = buf->buffer;
- int cpu;
if (!buffer)
return;
@@ -2015,8 +2013,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
buf->time_start = buffer_ftrace_now(buf, buf->cpu);
- for_each_online_cpu(cpu)
- ring_buffer_reset_cpu(buffer, cpu);
+ ring_buffer_reset_online_cpus(buffer);
ring_buffer_record_enable(buffer);
}
@@ -2932,12 +2929,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
skip++;
#endif
- /*
- * Since events can happen in NMIs there's no safe way to
- * use the per cpu ftrace_stacks. We reserve it and if an interrupt
- * or NMI comes in, it will just have to use the default
- * FTRACE_STACK_SIZE.
- */
preempt_disable_notrace();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
@@ -3137,6 +3128,9 @@ static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
+ if (trace_percpu_buffer)
+ return 0;
+
buffers = alloc_percpu(struct trace_buffer_struct);
if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
return -ENOMEM;
@@ -3339,6 +3333,26 @@ int trace_array_vprintk(struct trace_array *tr,
return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
}
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, they have the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should not
+ * be ever encorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
@@ -3346,12 +3360,16 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
if (!tr)
return -ENOENT;
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3359,6 +3377,27 @@ int trace_array_printk(struct trace_array *tr,
}
EXPORT_SYMBOL_GPL(trace_array_printk);
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr == &global_trace)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
@@ -5887,7 +5926,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
}
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->current_trace->ref) {
+ if (tr->trace_ref) {
ret = -EBUSY;
goto out;
}
@@ -6103,7 +6142,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
- tr->current_trace->ref++;
+ tr->trace_ref++;
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -6122,7 +6161,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- tr->current_trace->ref--;
+ tr->trace_ref--;
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
@@ -7406,7 +7445,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kvzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7424,7 +7463,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
- tr->current_trace->ref++;
+ tr->trace_ref++;
mutex_unlock(&trace_types_lock);
@@ -7525,14 +7564,14 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- iter->tr->current_trace->ref--;
+ iter->tr->trace_ref--;
__trace_array_put(iter->tr);
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
- kfree(info);
+ kvfree(info);
mutex_unlock(&trace_types_lock);
@@ -8733,7 +8772,7 @@ static int __remove_instance(struct trace_array *tr)
int i;
/* Reference counter for a newly created trace array = 1. */
- if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
return -EBUSY;
list_del(&tr->list);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13db4000af3f..610d21355526 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -356,6 +356,7 @@ struct trace_array {
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
+ int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -547,7 +548,6 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
- int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_PID_IGNORE -1
+#define FTRACE_PID_TRACE -2
+
struct ftrace_func_command {
struct list_head list;
char *name;
@@ -1114,7 +1118,8 @@ struct ftrace_func_command {
extern bool ftrace_filter_param __initdata;
static inline int ftrace_trace_task(struct trace_array *tr)
{
- return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
+ FTRACE_PID_IGNORE;
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f6f55682d3e2..a85effb2373b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index e2be7bb7ef7e..17873e5d0353 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -283,6 +283,7 @@ static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
if (disable_migrate)
@@ -296,7 +297,7 @@ static void move_to_next_cpu(void)
goto disable;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(smp_processor_id(), current_mask);
put_online_cpus();
@@ -371,9 +372,8 @@ static int start_kthread(struct trace_array *tr)
return 0;
/* Just pick the first CPU on first iteration */
- current_mask = &save_cpumask;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
put_online_cpus();
next_cpu = cpumask_first(current_mask);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 73976de7f8cc..4d1893564912 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static int next_event_type = __TRACE_LAST_TYPE;
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
@@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list);
static int trace_search_list(struct list_head **list)
{
struct trace_event *e;
- int last = __TRACE_LAST_TYPE;
+ int next = __TRACE_LAST_TYPE;
if (list_empty(&ftrace_event_list)) {
*list = &ftrace_event_list;
- return last + 1;
+ return next;
}
/*
@@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list)
* lets see if somebody freed one.
*/
list_for_each_entry(e, &ftrace_event_list, list) {
- if (e->type != last + 1)
+ if (e->type != next)
break;
- last++;
+ next++;
}
/* Did we used up all 65 thousand events??? */
- if ((last + 1) > TRACE_EVENT_TYPE_MAX)
+ if (next > TRACE_EVENT_TYPE_MAX)
return 0;
*list = &e->list;
- return last + 1;
+ return next;
}
void trace_event_read_lock(void)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fdd47f99b18f..f4286c9bdeb4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
default:
return 0;
}
- return 0;
}
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
diff --git a/kernel/umh.c b/kernel/umh.c
index a25433f9cd9a..fcf3ee803630 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}