summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.locks 2
-rw-r--r-- kernel/Makefile 5
-rw-r--r-- kernel/acct.c 7
-rw-r--r-- kernel/audit.c 17
-rw-r--r-- kernel/audit.h 2
-rw-r--r-- kernel/audit_fsnotify.c 10
-rw-r--r-- kernel/audit_tree.c 20
-rw-r--r-- kernel/audit_watch.c 12
-rw-r--r-- kernel/auditsc.c 12
-rw-r--r-- kernel/bpf/Makefile 3
-rw-r--r-- kernel/bpf/arraymap.c 22
-rw-r--r-- kernel/bpf/bpf_lru_list.c 695
-rw-r--r-- kernel/bpf/bpf_lru_list.h 84
-rw-r--r-- kernel/bpf/cgroup.c 235
-rw-r--r-- kernel/bpf/core.c 95
-rw-r--r-- kernel/bpf/hashtab.c 465
-rw-r--r-- kernel/bpf/helpers.c 67
-rw-r--r-- kernel/bpf/inode.c 101
-rw-r--r-- kernel/bpf/stackmap.c 25
-rw-r--r-- kernel/bpf/syscall.c 258
-rw-r--r-- kernel/bpf/verifier.c 1083
-rw-r--r-- kernel/capability.c 39
-rw-r--r-- kernel/cgroup.c 137
-rw-r--r-- kernel/compat.c 10
-rw-r--r-- kernel/configs.c 2
-rw-r--r-- kernel/configs/android-base.config 7
-rw-r--r-- kernel/configs/android-recommended.config 4
-rw-r--r-- kernel/configs/kvm_guest.config 32
-rw-r--r-- kernel/cpu.c 744
-rw-r--r-- kernel/cpuset.c 15
-rw-r--r-- kernel/debug/debug_core.c 4
-rw-r--r-- kernel/debug/kdb/kdb_io.c 39
-rw-r--r-- kernel/debug/kdb/kdb_main.c 1
-rw-r--r-- kernel/debug/kdb/kdb_private.h 1
-rw-r--r-- kernel/delayacct.c 6
-rw-r--r-- kernel/events/core.c 724
-rw-r--r-- kernel/events/uprobes.c 44
-rw-r--r-- kernel/exit.c 121
-rw-r--r-- kernel/extable.c 11
-rw-r--r-- kernel/fork.c 236
-rw-r--r-- kernel/futex.c 29
-rw-r--r-- kernel/futex_compat.c 2
-rw-r--r-- kernel/groups.c 69
-rw-r--r-- kernel/hung_task.c 29
-rw-r--r-- kernel/irq/affinity.c 169
-rw-r--r-- kernel/irq/chip.c 21
-rw-r--r-- kernel/irq/devres.c 65
-rw-r--r-- kernel/irq/generic-chip.c 72
-rw-r--r-- kernel/irq/irqdesc.c 224
-rw-r--r-- kernel/irq/irqdomain.c 94
-rw-r--r-- kernel/irq/manage.c 10
-rw-r--r-- kernel/irq/msi.c 34
-rw-r--r-- kernel/irq/proc.c 2
-rw-r--r-- kernel/irq/spurious.c 4
-rw-r--r-- kernel/jump_label.c 7
-rw-r--r-- kernel/kcov.c 23
-rw-r--r-- kernel/kexec_core.c 5
-rw-r--r-- kernel/kexec_file.c 145
-rw-r--r-- kernel/kexec_internal.h 16
-rw-r--r-- kernel/kmod.c 2
-rw-r--r-- kernel/kprobes.c 77
-rw-r--r-- kernel/kthread.c 685
-rw-r--r-- kernel/livepatch/core.c 19
-rw-r--r-- kernel/locking/Makefile 2
-rw-r--r-- kernel/locking/lglock.c 111
-rw-r--r-- kernel/locking/lockdep.c 155
-rw-r--r-- kernel/locking/lockdep_internals.h 20
-rw-r--r-- kernel/locking/lockdep_proc.c 2
-rw-r--r-- kernel/locking/locktorture.c 79
-rw-r--r-- kernel/locking/mcs_spinlock.h 4
-rw-r--r-- kernel/locking/mutex-debug.c 13
-rw-r--r-- kernel/locking/mutex-debug.h 27
-rw-r--r-- kernel/locking/mutex.c 990
-rw-r--r-- kernel/locking/mutex.h 30
-rw-r--r-- kernel/locking/osq_lock.c 15
-rw-r--r-- kernel/locking/percpu-rwsem.c 229
-rw-r--r-- kernel/locking/qrwlock.c 6
-rw-r--r-- kernel/locking/qspinlock_paravirt.h 28
-rw-r--r-- kernel/locking/qspinlock_stat.h 16
-rw-r--r-- kernel/locking/rtmutex.c 88
-rw-r--r-- kernel/locking/rtmutex_common.h 6
-rw-r--r-- kernel/locking/rwsem-spinlock.c 18
-rw-r--r-- kernel/locking/rwsem-xadd.c 132
-rw-r--r-- kernel/locking/semaphore.c 7
-rw-r--r-- kernel/locking/spinlock.c 8
-rw-r--r-- kernel/locking/spinlock_debug.c 86
-rw-r--r-- kernel/locking/test-ww_mutex.c 646
-rw-r--r-- kernel/membarrier.c 4
-rw-r--r-- kernel/memremap.c 4
-rw-r--r-- kernel/module.c 138
-rw-r--r-- kernel/padata.c 92
-rw-r--r-- kernel/panic.c 104
-rw-r--r-- kernel/pid.c 4
-rw-r--r-- kernel/pid_namespace.c 58
-rw-r--r-- kernel/power/Kconfig 4
-rw-r--r-- kernel/power/hibernate.c 21
-rw-r--r-- kernel/power/main.c 89
-rw-r--r-- kernel/power/power.h 8
-rw-r--r-- kernel/power/process.c 17
-rw-r--r-- kernel/power/qos.c 11
-rw-r--r-- kernel/power/snapshot.c 24
-rw-r--r-- kernel/power/suspend.c 79
-rw-r--r-- kernel/power/suspend_test.c 6
-rw-r--r-- kernel/power/swap.c 21
-rw-r--r-- kernel/power/user.c 2
-rw-r--r-- kernel/printk/nmi.c 83
-rw-r--r-- kernel/printk/printk.c 348
-rw-r--r-- kernel/profile.c 2
-rw-r--r-- kernel/ptrace.c 77
-rw-r--r-- kernel/rcu/rcu.h 1
-rw-r--r-- kernel/rcu/rcuperf.c 7
-rw-r--r-- kernel/rcu/rcutorture.c 92
-rw-r--r-- kernel/rcu/srcu.c 143
-rw-r--r-- kernel/rcu/sync.c 14
-rw-r--r-- kernel/rcu/tiny.c 8
-rw-r--r-- kernel/rcu/tiny_plugin.h 9
-rw-r--r-- kernel/rcu/tree.c 356
-rw-r--r-- kernel/rcu/tree.h 17
-rw-r--r-- kernel/rcu/tree_exp.h 190
-rw-r--r-- kernel/rcu/tree_plugin.h 10
-rw-r--r-- kernel/rcu/tree_trace.c 10
-rw-r--r-- kernel/rcu/update.c 47
-rw-r--r-- kernel/relay.c 228
-rw-r--r-- kernel/sched/Makefile 4
-rw-r--r-- kernel/sched/autogroup.c (renamed from kernel/sched/auto_group.c) 40
-rw-r--r-- kernel/sched/autogroup.h (renamed from kernel/sched/auto_group.h) 0
-rw-r--r-- kernel/sched/clock.c 158
-rw-r--r-- kernel/sched/completion.c 10
-rw-r--r-- kernel/sched/core.c 2477
-rw-r--r-- kernel/sched/cpuacct.c 2
-rw-r--r-- kernel/sched/cpudeadline.c 153
-rw-r--r-- kernel/sched/cpudeadline.h 3
-rw-r--r-- kernel/sched/cpufreq.c 2
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 241
-rw-r--r-- kernel/sched/cputime.c 335
-rw-r--r-- kernel/sched/deadline.c 100
-rw-r--r-- kernel/sched/debug.c 110
-rw-r--r-- kernel/sched/fair.c 1539
-rw-r--r-- kernel/sched/idle.c 186
-rw-r--r-- kernel/sched/idle_task.c 6
-rw-r--r-- kernel/sched/rt.c 19
-rw-r--r-- kernel/sched/sched.h 278
-rw-r--r-- kernel/sched/stats.h 60
-rw-r--r-- kernel/sched/stop_task.c 2
-rw-r--r-- kernel/sched/topology.c 1658
-rw-r--r-- kernel/sched/wait.c 123
-rw-r--r-- kernel/seccomp.c 38
-rw-r--r-- kernel/signal.c 44
-rw-r--r-- kernel/smp.c 70
-rw-r--r-- kernel/smpboot.c 5
-rw-r--r-- kernel/softirq.c 49
-rw-r--r-- kernel/stacktrace.c 12
-rw-r--r-- kernel/stop_machine.c 49
-rw-r--r-- kernel/sys.c 31
-rw-r--r-- kernel/sys_ni.c 8
-rw-r--r-- kernel/sysctl.c 52
-rw-r--r-- kernel/sysctl_binary.c 4
-rw-r--r-- kernel/taskstats.c 24
-rw-r--r-- kernel/time/Makefile 11
-rw-r--r-- kernel/time/alarmtimer.c 84
-rw-r--r-- kernel/time/clockevents.c 6
-rw-r--r-- kernel/time/clocksource.c 22
-rw-r--r-- kernel/time/hrtimer.c 140
-rw-r--r-- kernel/time/itimer.c 87
-rw-r--r-- kernel/time/jiffies.c 36
-rw-r--r-- kernel/time/ntp.c 2
-rw-r--r-- kernel/time/posix-cpu-timers.c 178
-rw-r--r-- kernel/time/posix-stubs.c 123
-rw-r--r-- kernel/time/posix-timers.c 24
-rw-r--r-- kernel/time/tick-broadcast-hrtimer.c 2
-rw-r--r-- kernel/time/tick-broadcast.c 57
-rw-r--r-- kernel/time/tick-common.c 4
-rw-r--r-- kernel/time/tick-oneshot.c 2
-rw-r--r-- kernel/time/tick-sched.c 62
-rw-r--r-- kernel/time/time.c 14
-rw-r--r-- kernel/time/timeconst.bc 6
-rw-r--r-- kernel/time/timecounter.c 6
-rw-r--r-- kernel/time/timekeeping.c 191
-rw-r--r-- kernel/time/timekeeping.h 2
-rw-r--r-- kernel/time/timekeeping_debug.c 2
-rw-r--r-- kernel/time/timekeeping_internal.h 6
-rw-r--r-- kernel/time/timer.c 174
-rw-r--r-- kernel/time/timer_list.c 14
-rw-r--r-- kernel/time/timer_stats.c 425
-rw-r--r-- kernel/torture.c 27
-rw-r--r-- kernel/trace/Kconfig 42
-rw-r--r-- kernel/trace/Makefile 5
-rw-r--r-- kernel/trace/blktrace.c 92
-rw-r--r-- kernel/trace/bpf_trace.c 162
-rw-r--r-- kernel/trace/ftrace.c 55
-rw-r--r-- kernel/trace/ring_buffer.c 165
-rw-r--r-- kernel/trace/trace.c 539
-rw-r--r-- kernel/trace/trace.h 32
-rw-r--r-- kernel/trace/trace_benchmark.c 26
-rw-r--r-- kernel/trace/trace_benchmark.h 2
-rw-r--r-- kernel/trace/trace_branch.c 2
-rw-r--r-- kernel/trace/trace_entries.h 42
-rw-r--r-- kernel/trace/trace_events.c 83
-rw-r--r-- kernel/trace/trace_events_filter.c 119
-rw-r--r-- kernel/trace/trace_events_trigger.c 1
-rw-r--r-- kernel/trace/trace_functions_graph.c 108
-rw-r--r-- kernel/trace/trace_hwlat.c 635
-rw-r--r-- kernel/trace/trace_irqsoff.c 16
-rw-r--r-- kernel/trace/trace_kprobe.c 51
-rw-r--r-- kernel/trace/trace_output.c 96
-rw-r--r-- kernel/trace/trace_probe.c 30
-rw-r--r-- kernel/trace/trace_probe.h 11
-rw-r--r-- kernel/trace/trace_sched_wakeup.c 17
-rw-r--r-- kernel/trace/trace_syscalls.c 6
-rw-r--r-- kernel/trace/trace_uprobe.c 8
-rw-r--r-- kernel/tracepoint.c 12
-rw-r--r-- kernel/tsacct.c 21
-rw-r--r-- kernel/ucount.c 236
-rw-r--r-- kernel/uid16.c 6
-rw-r--r-- kernel/up.c 18
-rw-r--r-- kernel/user_namespace.c 99
-rw-r--r-- kernel/utsname.c 40
-rw-r--r-- kernel/watchdog.c 279
-rw-r--r-- kernel/watchdog_hld.c 230
-rw-r--r-- kernel/workqueue.c 147
220 files changed, 16474 insertions, 8804 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index ebdb0043203a..84d882f3e299 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
config RWSEM_SPIN_ON_OWNER
def_bool y
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..12c679f769c6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o
+ async.o range.o smpboot.o ucount.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -115,8 +116,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o
$(obj)/configs.o: $(obj)/config_data.h
-# config_data.h contains the same information as ikconfig.h but gzipped.
-# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..ca9cb55b5855 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -453,8 +453,8 @@ static void fill_ac(acct_t *ac)
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
+ ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
@@ -530,7 +530,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
- cputime_t utime, stime;
+ u64 utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -559,6 +559,7 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
+
task_cputime(current, &utime, &stime);
pacct->ac_utime += utime;
pacct->ac_stime += stime;
diff --git a/kernel/audit.c b/kernel/audit.c
index 25dd70a588b2..e794544f5e63 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -125,7 +125,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-static int audit_net_id;
+static unsigned int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -1007,6 +1007,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
+ /* NOTE: we are using task_tgid_vnr() below because
+ * the s.pid value is relative to the namespace
+ * of the caller; at present this doesn't matter
+ * much since you can really only run auditd
+ * from the initial pid namespace, but something
+ * to keep in mind if this changes */
int new_pid = s.pid;
pid_t requesting_pid = task_tgid_vnr(current);
@@ -1310,9 +1316,8 @@ static void __net_exit audit_net_exit(struct net *net)
auditd_reset();
mutex_unlock(&audit_cmd_mutex);
- RCU_INIT_POINTER(aunet->nlsk, NULL);
- synchronize_net();
netlink_kernel_release(sock);
+ aunet->nlsk = NULL;
}
static struct pernet_operations audit_net_ops __net_initdata = {
@@ -1896,7 +1901,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
* @call_panic: optional pointer to int that will be updated if secid fails
*/
void audit_log_name(struct audit_context *context, struct audit_names *n,
- struct path *path, int record_num, int *call_panic)
+ const struct path *path, int record_num, int *call_panic)
{
struct audit_buffer *ab;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
@@ -2059,7 +2064,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(tsk),
- task_pid_nr(tsk),
+ task_tgid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
@@ -2084,7 +2089,7 @@ EXPORT_SYMBOL(audit_log_task_info);
* @operation: specific link operation
* @link: the path that triggered the restriction
*/
-void audit_log_link_denied(const char *operation, struct path *link)
+void audit_log_link_denied(const char *operation, const struct path *link)
{
struct audit_buffer *ab;
struct audit_names *name;
diff --git a/kernel/audit.h b/kernel/audit.h
index 144b7ebd2deb..ca579880303a 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -215,7 +215,7 @@ extern void audit_copy_inode(struct audit_names *name,
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, struct path *path,
+ struct audit_names *n, const struct path *path,
int record_num, int *call_panic);
extern int audit_pid;
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index f75154889aa9..7ea57e516029 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_
}
static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
- struct inode *inode)
+ const struct inode *inode)
{
audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
@@ -167,11 +167,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie)
{
struct audit_fsnotify_mark *audit_mark;
- struct inode *inode = NULL;
+ const struct inode *inode = NULL;
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
@@ -179,10 +179,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = ((struct path *)data)->dentry->d_inode;
+ inode = ((const struct path *)data)->dentry->d_inode;
break;
case (FSNOTIFY_EVENT_INODE):
- inode = (struct inode *)data;
+ inode = (const struct inode *)data;
break;
default:
BUG();
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 055f11b0a50f..7b44195da81b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -231,9 +231,11 @@ static void untag_chunk(struct node *p)
if (size)
new = alloc_chunk(size);
+ mutex_lock(&entry->group->mark_mutex);
spin_lock(&entry->lock);
if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
if (new)
free_chunk(new);
goto out;
@@ -251,6 +253,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -258,8 +261,8 @@ static void untag_chunk(struct node *p)
if (!new)
goto Fallback;
- fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
+ if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+ NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -293,6 +296,7 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
@@ -309,6 +313,7 @@ Fallback:
put_tree(owner);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ mutex_unlock(&entry->group->mark_mutex);
out:
fsnotify_put_mark(entry);
spin_lock(&hash_lock);
@@ -386,18 +391,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
+ mutex_lock(&old_entry->group->mark_mutex);
spin_lock(&old_entry->lock);
if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(old_entry);
free_chunk(chunk);
return -ENOENT;
}
- fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
+ if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
+ old_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
@@ -413,6 +421,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk->dead = 1;
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_destroy_mark(chunk_entry, audit_tree_group);
@@ -445,6 +454,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
+ mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_destroy_mark(old_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
@@ -947,7 +957,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 686e068ec3da..f79e4658433d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -471,10 +471,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie)
{
- struct inode *inode;
+ const struct inode *inode;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
@@ -483,10 +483,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = d_backing_inode(((struct path *)data)->dentry);
+ inode = d_backing_inode(((const struct path *)data)->dentry);
break;
case (FSNOTIFY_EVENT_INODE):
- inode = (struct inode *)data;
+ inode = (const struct inode *)data;
break;
default:
BUG();
@@ -547,8 +547,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
exe_file = get_task_exe_file(tsk);
if (!exe_file)
return 0;
- ino = exe_file->f_inode->i_ino;
- dev = exe_file->f_inode->i_sb->s_dev;
+ ino = file_inode(exe_file)->i_ino;
+ dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4db32e8669f8..d6a8de5f8fa3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk,
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -2003,7 +2003,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
loginuid = from_kuid(&init_user_ns, kloginuid),
tty = audit_get_tty(current);
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
@@ -2233,7 +2233,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2258,7 +2258,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
+ audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2358,7 +2358,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2399,7 +2399,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d091da..1276474ac3cd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,8 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..3d55d95dcf49 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,7 +11,6 @@
*/
#include <linux/bpf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
@@ -56,7 +55,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
- if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
+ if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
@@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
-
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
- return ERR_PTR(-ENOMEM);
- }
+ array = bpf_map_area_alloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
- kvfree(array);
+ bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
out:
@@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
- kvfree(array);
+ bpf_map_area_free(array);
}
static const struct bpf_map_ops array_ops = {
@@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
- kvfree(array);
+
+ bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
@@ -538,7 +534,7 @@ static int __init register_perf_event_array_map(void)
}
late_initcall(register_perf_event_array_map);
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
int fd)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
new file mode 100644
index 000000000000..89b7ef41c86b
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.c
@@ -0,0 +1,695 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include "bpf_lru_list.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
+
+#define PERCPU_FREE_TARGET (16)
+#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
+
+/* Helpers to get the local list index */
+#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
+#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
+#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
+#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
+
+static int get_next_cpu(int cpu)
+{
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_possible_mask);
+ return cpu;
+}
+
+/* Local list helpers */
+static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_FREE_LIST_IDX];
+}
+
+static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
+}
+
+/* bpf_lru_node helpers */
+static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
+{
+ return node->ref;
+}
+
+static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]++;
+}
+
+static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]--;
+}
+
+static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ /* If the removing node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ bpf_lru_list_count_dec(l, node->type);
+
+ node->type = tgt_free_type;
+ list_move(&node->list, free_list);
+}
+
+/* Move nodes from local list to the LRU list */
+static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ node->ref = 0;
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+/* Move nodes between or within active and inactive list (like
+ * active to inactive, inactive to active or tail of active back to
+ * the head of active).
+ */
+static void __bpf_lru_node_move(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ if (node->type != tgt_type) {
+ bpf_lru_list_count_dec(l, node->type);
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ }
+ node->ref = 0;
+
+ /* If the moving node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
+{
+ return l->counts[BPF_LRU_LIST_T_INACTIVE] <
+ l->counts[BPF_LRU_LIST_T_ACTIVE];
+}
+
+/* Rotate the active list:
+ * 1. Start from tail
+ * 2. If the node has the ref bit set, it will be rotated
+ * back to the head of active list with the ref bit cleared.
+ * Give this node one more chance to survive in the active list.
+ * 3. If the ref bit is not set, move it to the head of the
+ * inactive list.
+ * 4. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(active, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+
+ if (++i == lru->nr_scans || node == first_node)
+ break;
+ }
+}
+
+/* Rotate the inactive list. It starts from the next_inactive_rotation
+ * 1. If the node has ref bit set, it will be moved to the head
+ * of active list with the ref bit cleared.
+ * 2. If the node does not have ref bit set, it will leave it
+ * at its current location (i.e. do nothing) so that it can
+ * be considered during the next inactive_shrink.
+ * 3. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct list_head *cur, *last, *next = inactive;
+ struct bpf_lru_node *node;
+ unsigned int i = 0;
+
+ if (list_empty(inactive))
+ return;
+
+ last = l->next_inactive_rotation->next;
+ if (last == inactive)
+ last = last->next;
+
+ cur = l->next_inactive_rotation;
+ while (i < lru->nr_scans) {
+ if (cur == inactive) {
+ cur = cur->prev;
+ continue;
+ }
+
+ node = list_entry(cur, struct bpf_lru_node, list);
+ next = cur->prev;
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ if (cur == last)
+ break;
+ cur = next;
+ i++;
+ }
+
+ l->next_inactive_rotation = next;
+}
+
+/* Shrink the inactive list. It starts from the tail of the
+ * inactive list and only move the nodes without the ref bit
+ * set to the designated free list.
+ */
+static unsigned int
+__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int nshrinked = 0;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(inactive, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
+ if (bpf_lru_node_is_ref(node)) {
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ } else if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ if (++nshrinked == tgt_nshrink)
+ break;
+ }
+
+ if (++i == lru->nr_scans)
+ break;
+ }
+
+ return nshrinked;
+}
+
+/* 1. Rotate the active list (if needed)
+ * 2. Always rotate the inactive list
+ */
+static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
+{
+ if (bpf_lru_list_inactive_low(l))
+ __bpf_lru_list_rotate_active(lru, l);
+
+ __bpf_lru_list_rotate_inactive(lru, l);
+}
+
+ /* Free up to @tgt_nshrink nodes from @l into @free_list.
+  *
+  * First __bpf_lru_list_shrink_inactive() is tried, which honors the
+  * ref bit.  If that frees nothing, a forced shrink takes exactly one
+  * node while ignoring the ref bit: the inactive list is used when it
+  * is non-empty, otherwise the active list.  The forced walk goes from
+  * the tail and stops at the first node lru->del_from_htab() accepts.
+  *
+  * Returns the number of nodes moved to @free_list (0 if none).
+  */
+ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
+ 					  struct bpf_lru_list *l,
+ 					  unsigned int tgt_nshrink,
+ 					  struct list_head *free_list,
+ 					  enum bpf_lru_list_type tgt_free_type)
+ {
+ 	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ 	struct bpf_lru_node *cur, *prev;
+ 	struct list_head *victims;
+ 	unsigned int freed;
+
+ 	freed = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
+ 					       free_list, tgt_free_type);
+ 	if (freed != 0)
+ 		return freed;
+
+ 	/* Forced shrink: the reference bit is no longer consulted. */
+ 	victims = list_empty(inactive) ? &l->lists[BPF_LRU_LIST_T_ACTIVE] :
+ 					 inactive;
+
+ 	list_for_each_entry_safe_reverse(cur, prev, victims, list) {
+ 		if (!lru->del_from_htab(lru->del_arg, cur))
+ 			continue;
+
+ 		__bpf_lru_node_move_to_free(l, cur, free_list, tgt_free_type);
+ 		return 1;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Drain the local pending list of @loc_l into the LRU list @l.
+  *
+  * The pending list is walked from its tail; a node with the ref bit
+  * set goes in as BPF_LRU_LIST_T_ACTIVE, any other node goes in as
+  * BPF_LRU_LIST_T_INACTIVE.
+  */
+ static void __local_list_flush(struct bpf_lru_list *l,
+ 			       struct bpf_lru_locallist *loc_l)
+ {
+ 	struct list_head *pending = local_pending_list(loc_l);
+ 	struct bpf_lru_node *cur, *prev;
+
+ 	list_for_each_entry_safe_reverse(cur, prev, pending, list) {
+ 		enum bpf_lru_list_type dst;
+
+ 		dst = bpf_lru_node_is_ref(cur) ? BPF_LRU_LIST_T_ACTIVE :
+ 						 BPF_LRU_LIST_T_INACTIVE;
+ 		__bpf_lru_node_move_in(l, cur, dst);
+ 	}
+ }
+
+ /* Move @node to the free list of @l under l->lock (IRQs saved).
+  *
+  * A node whose type satisfies IS_LOCAL_LIST_TYPE() is not expected
+  * here: it triggers a one-time warning and is left untouched.
+  */
+ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
+ 				   struct bpf_lru_node *node)
+ {
+ 	unsigned long irq_flags;
+
+ 	if (!WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) {
+ 		raw_spin_lock_irqsave(&l->lock, irq_flags);
+ 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+ 		raw_spin_unlock_irqrestore(&l->lock, irq_flags);
+ 	}
+ }
+
+ /* Refill the local free list of @loc_l from the common LRU list.
+  *
+  * Under the common list lock this
+  *  1. flushes the local pending list back into the LRU list,
+  *  2. rotates the LRU list,
+  *  3. moves up to LOCAL_FREE_TARGET nodes from the global free list
+  *     to the local free list, and
+  *  4. if the target was not reached, shrinks the LRU list for the
+  *     remainder.
+  *
+  * The plain raw_spin_lock() is used; the caller in this file,
+  * bpf_common_lru_pop_free(), already holds loc_l->lock with IRQs
+  * saved when calling here.
+  */
+ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
+ 					   struct bpf_lru_locallist *loc_l)
+ {
+ 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ 	struct list_head *global_free = &l->lists[BPF_LRU_LIST_T_FREE];
+ 	struct bpf_lru_node *cur, *nxt;
+ 	unsigned int moved = 0;
+
+ 	raw_spin_lock(&l->lock);
+
+ 	__local_list_flush(l, loc_l);
+ 	__bpf_lru_list_rotate(lru, l);
+
+ 	list_for_each_entry_safe(cur, nxt, global_free, list) {
+ 		__bpf_lru_node_move_to_free(l, cur, local_free_list(loc_l),
+ 					    BPF_LRU_LOCAL_LIST_T_FREE);
+ 		moved++;
+ 		if (moved == LOCAL_FREE_TARGET)
+ 			break;
+ 	}
+
+ 	if (moved < LOCAL_FREE_TARGET)
+ 		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - moved,
+ 				      local_free_list(loc_l),
+ 				      BPF_LRU_LOCAL_LIST_T_FREE);
+
+ 	raw_spin_unlock(&l->lock);
+ }
+
+ /* Hand @node out to the caller and park it on the pending list.
+  *
+  * The element's u32 hash, located lru->hash_offset bytes from @node,
+  * is set to @hash.  The node is tagged with @cpu and
+  * BPF_LRU_LOCAL_LIST_T_PENDING, its ref bit is cleared, and it is
+  * added at the head of the pending list of @loc_l.
+  */
+ static void __local_list_add_pending(struct bpf_lru *lru,
+ 				     struct bpf_lru_locallist *loc_l,
+ 				     int cpu,
+ 				     struct bpf_lru_node *node,
+ 				     u32 hash)
+ {
+ 	u32 *hash_slot = (u32 *)((void *)node + lru->hash_offset);
+
+ 	*hash_slot = hash;
+ 	node->cpu = cpu;
+ 	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ 	node->ref = 0;
+ 	list_add(&node->list, local_pending_list(loc_l));
+ }
+
+ /* Unlink and return the first node of the local free list of @loc_l,
+  * or return NULL when that list is empty.
+  *
+  * File-local helper: it is not declared in bpf_lru_list.h, so it is
+  * given internal linkage.
+  */
+ static struct bpf_lru_node *
+ __local_list_pop_free(struct bpf_lru_locallist *loc_l)
+ {
+ 	struct bpf_lru_node *node;
+
+ 	node = list_first_entry_or_null(local_free_list(loc_l),
+ 					struct bpf_lru_node,
+ 					list);
+ 	if (node)
+ 		list_del(&node->list);
+
+ 	return node;
+ }
+
+ /* Take one node off the local pending list of @loc_l.
+  *
+  * Two passes are made over the pending list, each from the tail:
+  *  pass 0 only considers nodes whose ref bit is clear;
+  *  pass 1 ignores the ref bit.
+  * In both passes a node is taken only if lru->del_from_htab()
+  * accepts it; the node is then unlinked and returned.
+  *
+  * Returns NULL when no node could be taken.
+  *
+  * File-local helper: it is not declared in bpf_lru_list.h, so it is
+  * given internal linkage.
+  */
+ static struct bpf_lru_node *
+ __local_list_pop_pending(struct bpf_lru *lru,
+ 			 struct bpf_lru_locallist *loc_l)
+ {
+ 	struct bpf_lru_node *node;
+ 	int pass;
+
+ 	for (pass = 0; pass < 2; pass++) {
+ 		bool force = pass != 0;
+
+ 		/* Get from the tail (i.e. older element) of the pending list. */
+ 		list_for_each_entry_reverse(node, local_pending_list(loc_l),
+ 					    list) {
+ 			if ((!bpf_lru_node_is_ref(node) || force) &&
+ 			    lru->del_from_htab(lru->del_arg, node)) {
+ 				list_del(&node->list);
+ 				return node;
+ 			}
+ 		}
+ 	}
+
+ 	return NULL;
+ }
+
+ /* Allocate a node from the current CPU's own LRU list (percpu mode).
+  *
+  * With the list lock held and IRQs saved, the list is rotated; when
+  * its free list is empty a shrink for PERCPU_FREE_TARGET nodes is
+  * attempted.  If a free node is then available, @hash is stored in
+  * the element, the ref bit is cleared and the node is moved to the
+  * inactive list.
+  *
+  * Returns the node, or NULL when nothing could be freed.
+  */
+ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
+ 						    u32 hash)
+ {
+ 	int cpu = raw_smp_processor_id();
+ 	struct bpf_lru_list *l = per_cpu_ptr(lru->percpu_lru, cpu);
+ 	struct list_head *free_list = &l->lists[BPF_LRU_LIST_T_FREE];
+ 	struct bpf_lru_node *node = NULL;
+ 	unsigned long flags;
+
+ 	raw_spin_lock_irqsave(&l->lock, flags);
+
+ 	__bpf_lru_list_rotate(lru, l);
+
+ 	if (list_empty(free_list))
+ 		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
+ 				      BPF_LRU_LIST_T_FREE);
+
+ 	if (list_empty(free_list))
+ 		goto unlock;
+
+ 	node = list_first_entry(free_list, struct bpf_lru_node, list);
+ 	*(u32 *)((void *)node + lru->hash_offset) = hash;
+ 	node->ref = 0;
+ 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+
+ unlock:
+ 	raw_spin_unlock_irqrestore(&l->lock, flags);
+
+ 	return node;
+ }
+
+ /* Allocate a node in common-LRU mode.
+  *
+  * Fast path: take a node from the current CPU's local free list,
+  * refilling that list from the common LRU list when it is empty.
+  *
+  * Slow path: steal from the local free list, then the local pending
+  * list, of each CPU in turn, starting at loc_l->next_steal and
+  * stopping after one full round.
+  *
+  * A node obtained either way is put on the current CPU's pending
+  * list with @hash recorded.  Returns NULL when no node was found.
+  */
+ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
+ 						    u32 hash)
+ {
+ 	struct bpf_common_lru *clru = &lru->common_lru;
+ 	struct bpf_lru_locallist *loc_l, *victim;
+ 	struct bpf_lru_node *node;
+ 	int start_cpu, steal_cpu;
+ 	unsigned long flags;
+ 	int cpu = raw_smp_processor_id();
+
+ 	loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+ 	raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ 	node = __local_list_pop_free(loc_l);
+ 	if (!node) {
+ 		bpf_lru_list_pop_free_to_local(lru, loc_l);
+ 		node = __local_list_pop_free(loc_l);
+ 	}
+
+ 	if (node)
+ 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
+
+ 	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ 	if (node)
+ 		return node;
+
+ 	/* Neither the local free list nor the global LRU list had a
+ 	 * node to offer.
+ 	 *
+ 	 * Steal from the local free/pending lists of the current CPU
+ 	 * and of remote CPUs in round-robin order, beginning with the
+ 	 * loc_l->next_steal CPU.
+ 	 */
+ 	start_cpu = loc_l->next_steal;
+ 	steal_cpu = start_cpu;
+ 	for (;;) {
+ 		victim = per_cpu_ptr(clru->local_list, steal_cpu);
+
+ 		raw_spin_lock_irqsave(&victim->lock, flags);
+
+ 		node = __local_list_pop_free(victim);
+ 		if (!node)
+ 			node = __local_list_pop_pending(lru, victim);
+
+ 		raw_spin_unlock_irqrestore(&victim->lock, flags);
+
+ 		steal_cpu = get_next_cpu(steal_cpu);
+ 		if (node || steal_cpu == start_cpu)
+ 			break;
+ 	}
+
+ 	loc_l->next_steal = steal_cpu;
+
+ 	if (!node)
+ 		return NULL;
+
+ 	raw_spin_lock_irqsave(&loc_l->lock, flags);
+ 	__local_list_add_pending(lru, loc_l, cpu, node, hash);
+ 	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ 	return node;
+ }
+
+ /* Get a free node from @lru and record @hash in its element.
+  * Dispatches on lru->percpu to the per-cpu or the common variant.
+  * Returns NULL if no node could be obtained.
+  */
+ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
+ {
+ 	if (!lru->percpu)
+ 		return bpf_common_lru_pop_free(lru, hash);
+
+ 	return bpf_percpu_lru_pop_free(lru, hash);
+ }
+
+ /* Give @node back in common-LRU mode.
+  *
+  * A node that is already on a free list (global or local) triggers a
+  * one-time warning and is ignored.
+  *
+  * A node on a local pending list is moved to the local free list of
+  * the CPU recorded in node->cpu.  The type is checked again once that
+  * CPU's lock is held; if the node is no longer pending by then, it is
+  * handled like any other node.
+  *
+  * Every other node is pushed to the free list of the common LRU list.
+  */
+ static void bpf_common_lru_push_free(struct bpf_lru *lru,
+ 				     struct bpf_lru_node *node)
+ {
+ 	unsigned long flags;
+
+ 	if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
+ 	    WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ 		return;
+
+ 	if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ 		struct bpf_lru_locallist *loc_l;
+ 		bool released = false;
+
+ 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
+
+ 		raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ 		if (likely(node->type == BPF_LRU_LOCAL_LIST_T_PENDING)) {
+ 			node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ 			node->ref = 0;
+ 			list_move(&node->list, local_free_list(loc_l));
+ 			released = true;
+ 		}
+
+ 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ 		if (released)
+ 			return;
+ 	}
+
+ 	bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
+ }
+
+ /* Give @node back in percpu mode: move it to the free list of the
+  * per-cpu LRU list selected by node->cpu, under that list's lock.
+  */
+ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
+ 				     struct bpf_lru_node *node)
+ {
+ 	struct bpf_lru_list *l = per_cpu_ptr(lru->percpu_lru, node->cpu);
+ 	unsigned long irq_flags;
+
+ 	raw_spin_lock_irqsave(&l->lock, irq_flags);
+ 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+ 	raw_spin_unlock_irqrestore(&l->lock, irq_flags);
+ }
+
+ /* Return @node to @lru.  Dispatches on lru->percpu to the per-cpu or
+  * the common variant.
+  */
+ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
+ {
+ 	if (!lru->percpu) {
+ 		bpf_common_lru_push_free(lru, node);
+ 		return;
+ 	}
+
+ 	bpf_percpu_lru_push_free(lru, node);
+ }
+
+ /* Put all @nr_elems preallocated elements on the free list of the
+  * common LRU list.
+  *
+  * @buf points to the first element, elements are @elem_size bytes
+  * apart and each embeds its struct bpf_lru_node at @node_offset.
+  * No lock is taken here.
+  *
+  * Only reachable through bpf_lru_populate(), hence static.
+  */
+ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+ 				    u32 node_offset, u32 elem_size,
+ 				    u32 nr_elems)
+ {
+ 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ 	u32 i;
+
+ 	for (i = 0; i < nr_elems; i++) {
+ 		struct bpf_lru_node *node;
+
+ 		node = (struct bpf_lru_node *)(buf + node_offset);
+ 		node->type = BPF_LRU_LIST_T_FREE;
+ 		node->ref = 0;
+ 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ 		buf += elem_size;
+ 	}
+ }
+
+ /* Distribute the preallocated elements over the per-cpu LRU lists.
+  *
+  * Each possible CPU receives nr_elems / num_possible_cpus() elements
+  * on its free list, in CPU iteration order.  Elements beyond that
+  * product are left unlinked, so callers are expected to pass a
+  * multiple of the number of possible CPUs.
+  *
+  * Two degenerate inputs are handled explicitly:
+  *  - @nr_elems == 0: nothing is touched;
+  *  - @nr_elems smaller than the number of possible CPUs: the per-cpu
+  *    share would be 0 and the modulo below would divide by zero, so
+  *    the share is clamped to 1 and the first @nr_elems CPUs get one
+  *    element each.
+  *
+  * Only reachable through bpf_lru_populate(), hence static.
+  */
+ static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+ 				    u32 node_offset, u32 elem_size,
+ 				    u32 nr_elems)
+ {
+ 	u32 i = 0, pcpu_entries;
+ 	int cpu;
+ 	struct bpf_lru_list *l;
+
+ 	if (!nr_elems)
+ 		return;
+
+ 	pcpu_entries = nr_elems / num_possible_cpus();
+ 	if (!pcpu_entries)
+ 		pcpu_entries = 1;
+
+ 	for_each_possible_cpu(cpu) {
+ 		struct bpf_lru_node *node;
+
+ 		l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+ 		do {
+ 			node = (struct bpf_lru_node *)(buf + node_offset);
+ 			node->cpu = cpu;
+ 			node->type = BPF_LRU_LIST_T_FREE;
+ 			node->ref = 0;
+ 			list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ 			i++;
+ 			buf += elem_size;
+
+ 			/* All elements handed out: done with every CPU. */
+ 			if (i == nr_elems)
+ 				return;
+ 		} while (i % pcpu_entries);
+ 	}
+ }
+
+ /* Seed @lru with @nr_elems preallocated elements.
+  *
+  * @buf is the first element, elements are @elem_size bytes apart and
+  * each embeds its struct bpf_lru_node at @node_offset.  Dispatches on
+  * lru->percpu to the per-cpu or the common variant.
+  */
+ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ 		      u32 elem_size, u32 nr_elems)
+ {
+ 	if (!lru->percpu) {
+ 		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
+ 					nr_elems);
+ 		return;
+ 	}
+
+ 	bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems);
+ }
+
+ /* Initialise one per-cpu local list: empty list heads, a fresh lock,
+  * and stealing set to start from @cpu itself.
+  */
+ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
+ {
+ 	int idx = 0;
+
+ 	raw_spin_lock_init(&loc_l->lock);
+ 	loc_l->next_steal = cpu;
+
+ 	while (idx < NR_BPF_LRU_LOCAL_LIST_T) {
+ 		INIT_LIST_HEAD(&loc_l->lists[idx]);
+ 		idx++;
+ 	}
+ }
+
+ /* Initialise one LRU list: empty list heads, zeroed counters, a fresh
+  * lock, and the inactive rotation cursor pointing at the inactive
+  * list head.
+  */
+ static void bpf_lru_list_init(struct bpf_lru_list *l)
+ {
+ 	int idx;
+
+ 	raw_spin_lock_init(&l->lock);
+
+ 	for (idx = 0; idx < NR_BPF_LRU_LIST_COUNT; idx++)
+ 		l->counts[idx] = 0;
+
+ 	for (idx = 0; idx < NR_BPF_LRU_LIST_T; idx++)
+ 		INIT_LIST_HEAD(&l->lists[idx]);
+
+ 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ }
+
+ /* Set up @lru.
+  *
+  * @percpu:        true for one LRU list per possible CPU; false for a
+  *                 single common LRU list plus per-cpu local lists
+  * @hash_offset:   byte offset from a node to its element's u32 hash
+  * @del_from_htab: callback invoked when the LRU wants to evict a node
+  * @del_arg:       opaque argument passed to @del_from_htab
+  *
+  * Returns 0 on success or -ENOMEM when the per-cpu allocation fails;
+  * on failure none of the callback/offset fields are written.
+  */
+ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ 		 del_from_htab_func del_from_htab, void *del_arg)
+ {
+ 	int cpu;
+
+ 	if (!percpu) {
+ 		struct bpf_common_lru *clru = &lru->common_lru;
+
+ 		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+ 		if (!clru->local_list)
+ 			return -ENOMEM;
+
+ 		for_each_possible_cpu(cpu)
+ 			bpf_lru_locallist_init(per_cpu_ptr(clru->local_list,
+ 							   cpu), cpu);
+
+ 		bpf_lru_list_init(&clru->lru_list);
+ 		lru->nr_scans = LOCAL_NR_SCANS;
+ 	} else {
+ 		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+ 		if (!lru->percpu_lru)
+ 			return -ENOMEM;
+
+ 		for_each_possible_cpu(cpu)
+ 			bpf_lru_list_init(per_cpu_ptr(lru->percpu_lru, cpu));
+
+ 		lru->nr_scans = PERCPU_NR_SCANS;
+ 	}
+
+ 	lru->percpu = percpu;
+ 	lru->del_from_htab = del_from_htab;
+ 	lru->del_arg = del_arg;
+ 	lru->hash_offset = hash_offset;
+
+ 	return 0;
+ }
+
+ /* Release the per-cpu memory allocated by bpf_lru_init(): the per-cpu
+  * LRU lists in percpu mode, the per-cpu local lists otherwise.
+  */
+ void bpf_lru_destroy(struct bpf_lru *lru)
+ {
+ 	if (!lru->percpu) {
+ 		free_percpu(lru->common_lru.local_list);
+ 		return;
+ 	}
+
+ 	free_percpu(lru->percpu_lru);
+ }
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
new file mode 100644
index 000000000000..5c35a98d02bf
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __BPF_LRU_LIST_H_
+#define __BPF_LRU_LIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#define NR_BPF_LRU_LIST_T (3)
+#define NR_BPF_LRU_LIST_COUNT (2)
+#define NR_BPF_LRU_LOCAL_LIST_T (2)
+#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
+
+ /* Value stored in bpf_lru_node.type. The first NR_BPF_LRU_LIST_T
+ * values also index bpf_lru_list.lists[].
+ */
+ enum bpf_lru_list_type {
+ BPF_LRU_LIST_T_ACTIVE,
+ BPF_LRU_LIST_T_INACTIVE,
+ BPF_LRU_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_FREE, /* first local-list type; equals BPF_LOCAL_LIST_T_OFFSET */
+ BPF_LRU_LOCAL_LIST_T_PENDING, /* handed out; sits on a per-cpu pending list */
+ };
+
+ /* LRU bookkeeping embedded in each element managed by a bpf_lru. */
+ struct bpf_lru_node {
+ struct list_head list; /* linkage into an LRU or local list */
+ u16 cpu; /* CPU recorded at populate/allocation time; selects the per-cpu list on push_free */
+ u8 type; /* an enum bpf_lru_list_type value */
+ u8 ref; /* reference bit: set by bpf_lru_node_set_ref(), cleared on (re)allocation */
+ };
+
+ /* One LRU list set: active, inactive and free lists behind one lock. */
+ struct bpf_lru_list {
+ struct list_head lists[NR_BPF_LRU_LIST_T]; /* indexed by BPF_LRU_LIST_T_* */
+ unsigned int counts[NR_BPF_LRU_LIST_COUNT]; /* NOTE(review): presumably node counts for the first two list types - confirm against the move helpers */
+ /* The next inactive list rotation starts from here */
+ struct list_head *next_inactive_rotation;
+
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
+ };
+
+ /* Per-cpu local lists used in common-LRU mode. */
+ struct bpf_lru_locallist {
+ struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; /* reached via local_free_list()/local_pending_list() */
+ u16 next_steal; /* CPU at which the next steal round starts; initialised to the owning CPU */
+ raw_spinlock_t lock; /* protects lists[] */
+ };
+
+ /* State for common-LRU mode: one shared LRU list plus per-cpu local lists. */
+ struct bpf_common_lru {
+ struct bpf_lru_list lru_list; /* the single LRU list shared by all CPUs */
+ struct bpf_lru_locallist __percpu *local_list; /* allocated in bpf_lru_init() */
+ };
+
+typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
+
+ /* Top-level LRU state. @percpu selects which union member is in use. */
+ struct bpf_lru {
+ union {
+ struct bpf_common_lru common_lru; /* valid when !percpu */
+ struct bpf_lru_list __percpu *percpu_lru; /* valid when percpu */
+ };
+ del_from_htab_func del_from_htab; /* called to evict a node; true means the node may be reused */
+ void *del_arg; /* first argument passed to del_from_htab */
+ unsigned int hash_offset; /* byte offset from a node to its element's u32 hash */
+ unsigned int nr_scans; /* max nodes examined per rotate/shrink pass */
+ bool percpu; /* true: one LRU list per possible CPU */
+ };
+
+ /* Mark @node as recently referenced. */
+ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
+ {
+ 	/* ref is an approximation on access frequency.  It does not
+ 	 * have to be very accurate.  Hence, no protection is used.
+ 	 *
+ 	 * Only store when the bit is not already set, so that repeated
+ 	 * lookups of a hot element do not keep dirtying its cache line.
+ 	 */
+ 	if (!node->ref)
+ 		node->ref = 1;
+ }
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *delete_arg);
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems);
+void bpf_lru_destroy(struct bpf_lru *lru);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
+
+#endif
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..da0f53690295
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,235 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+ struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ bpf_prog_put(prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+ struct bpf_prog *e;
+
+ e = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
+ }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool new_overridable)
+{
+ struct bpf_prog *old_prog, *effective = NULL;
+ struct cgroup_subsys_state *pos;
+ bool overridable = true;
+
+ if (parent) {
+ overridable = !parent->bpf.disallow_override[type];
+ effective = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ }
+
+ if (prog && effective && !overridable)
+ /* if parent has non-overridable prog attached, disallow
+ * attaching new programs to descendent cgroup
+ */
+ return -EPERM;
+
+ if (prog && effective && overridable != new_overridable)
+ /* if parent has overridable prog attached, only
+ * allow overridable programs in descendent cgroup
+ */
+ return -EPERM;
+
+ old_prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ overridable = new_overridable;
+ effective = prog;
+ if (old_prog &&
+ cgrp->bpf.disallow_override[type] == new_overridable)
+ /* disallow attaching non-overridable on top
+ * of existing overridable in this cgroup
+ * and vice versa
+ */
+ return -EPERM;
+ }
+
+ if (!prog && !old_prog)
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+
+ cgrp->bpf.prog[type] = prog;
+
+ css_for_each_descendant_pre(pos, &cgrp->self) {
+ struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+ /* skip the subtree if the descendant has its own program */
+ if (desc->bpf.prog[type] && desc != cgrp) {
+ pos = css_rightmost_descendant(pos);
+ } else {
+ rcu_assign_pointer(desc->bpf.effective[type],
+ effective);
+ desc->bpf.disallow_override[type] = !overridable;
+ }
+ }
+
+ if (prog)
+ static_branch_inc(&cgroup_bpf_enabled_key);
+
+ if (old_prog) {
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ return 0;
+}
+
+/**
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
+ * @sk: The socken sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ if (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog) {
+ unsigned int offset = skb->data - skb_network_header(skb);
+
+ __skb_push(skb, offset);
+ ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+ __skb_pull(skb, offset);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 03fd23d4d587..503d4211988a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
struct bpf_prog *fp;
+ u32 pages, delta;
+ int ret;
BUG_ON(fp_old == NULL);
size = round_up(size, PAGE_SIZE);
- if (size <= fp_old->pages * PAGE_SIZE)
+ pages = size / PAGE_SIZE;
+ if (pages <= fp_old->pages)
return fp_old;
+ delta = pages - fp_old->pages;
+ ret = __bpf_prog_charge(fp_old->aux->user, delta);
+ if (ret)
+ return NULL;
+
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp != NULL) {
+ if (fp == NULL) {
+ __bpf_prog_uncharge(fp_old->aux->user, delta);
+ } else {
kmemcheck_annotate_bitfield(fp, meta);
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
- fp->pages = size / PAGE_SIZE;
+ fp->pages = pages;
fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
@@ -136,6 +146,78 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
}
+ /* Compute fp->tag: a digest (sha_init()/sha_transform()) over a copy of
+ * the program's instructions in which map-fd immediates are zeroed.
+ * The copy is padded with a 0x80 byte, zeros, and a big-endian 64-bit
+ * bit length placed at the end of the last block.
+ *
+ * Returns 0 on success or -ENOMEM if the scratch buffer cannot be
+ * allocated.
+ */
+ int bpf_prog_calc_tag(struct bpf_prog *fp)
+ {
+ const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); /* offset of the length field within a block */
+ u32 raw_size = bpf_prog_tag_scratch_size(fp);
+ u32 digest[SHA_DIGEST_WORDS];
+ u32 ws[SHA_WORKSPACE_WORDS];
+ u32 i, bsize, psize, blocks;
+ struct bpf_insn *dst;
+ bool was_ld_map;
+ u8 *raw, *todo;
+ __be32 *result;
+ __be64 *bits;
+
+ raw = vmalloc(raw_size);
+ if (!raw)
+ return -ENOMEM;
+
+ sha_init(digest);
+ memset(ws, 0, sizeof(ws));
+
+ /* We need to take out the map fd for the digest calculation
+ * since they are unstable from user space side.
+ * The imm of a BPF_LD | BPF_IMM | BPF_DW insn with src_reg ==
+ * BPF_PSEUDO_MAP_FD is cleared, and so is the imm of the insn
+ * right after it when that insn is otherwise all-zero.
+ */
+ dst = (void *)raw;
+ for (i = 0, was_ld_map = false; i < fp->len; i++) {
+ dst[i] = fp->insnsi[i];
+ if (!was_ld_map &&
+ dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+ was_ld_map = true;
+ dst[i].imm = 0;
+ } else if (was_ld_map &&
+ dst[i].code == 0 &&
+ dst[i].dst_reg == 0 &&
+ dst[i].src_reg == 0 &&
+ dst[i].off == 0) {
+ was_ld_map = false;
+ dst[i].imm = 0;
+ } else {
+ was_ld_map = false;
+ }
+ }
+
+ psize = bpf_prog_insn_size(fp);
+ memset(&raw[psize], 0, raw_size - psize);
+ raw[psize++] = 0x80; /* padding marker right after the instructions */
+
+ bsize = round_up(psize, SHA_MESSAGE_BYTES);
+ blocks = bsize / SHA_MESSAGE_BYTES;
+ todo = raw;
+ if (bsize - psize >= sizeof(__be64)) { /* length field fits in the last block */
+ bits = (__be64 *)(todo + bsize - sizeof(__be64));
+ } else { /* otherwise it goes at the end of one extra block */
+ bits = (__be64 *)(todo + bsize + bits_offset);
+ blocks++;
+ }
+ *bits = cpu_to_be64((psize - 1) << 3); /* length in bits, without the 0x80 marker */
+
+ while (blocks--) {
+ sha_transform(digest, todo, ws);
+ todo += SHA_MESSAGE_BYTES;
+ }
+
+ result = (__force __be32 *)digest; /* digest words are converted to big endian in place */
+ for (i = 0; i < SHA_DIGEST_WORDS; i++)
+ result[i] = cpu_to_be32(digest[i]);
+ memcpy(fp->tag, result, sizeof(fp->tag));
+
+ vfree(raw);
+ return 0;
+ }
+
static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_JMP &&
@@ -1018,7 +1100,7 @@ void bpf_user_rnd_init_once(void)
prandom_init_once(&bpf_user_rnd_state);
}
-u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_user_rnd_u32)
{
/* Should someone ever have the rather unwise idea to use some
* of the registers passed into this function, then note that
@@ -1031,7 +1113,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
state = &get_cpu_var(bpf_user_rnd_state);
res = prandom_u32_state(state);
- put_cpu_var(state);
+ put_cpu_var(bpf_user_rnd_state);
return res;
}
@@ -1043,6 +1125,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
@@ -1077,7 +1160,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
return prog;
}
-bool __weak bpf_helper_changes_skb_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(void *func)
{
return false;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca7bdfa..a753bbe7df0a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,8 +13,8 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
struct bucket {
struct hlist_head head;
@@ -25,7 +25,10 @@ struct bpf_htab {
struct bpf_map map;
struct bucket *buckets;
void *elems;
- struct pcpu_freelist freelist;
+ union {
+ struct pcpu_freelist freelist;
+ struct bpf_lru lru;
+ };
void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
@@ -48,11 +51,26 @@ struct htab_elem {
union {
struct rcu_head rcu;
enum extra_elem_state state;
+ struct bpf_lru_node lru_node;
};
u32 hash;
char key[0] __aligned(8);
};
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+ /* True when @htab is one of the two LRU hash map types. */
+ static bool htab_is_lru(const struct bpf_htab *htab)
+ {
+ 	switch (htab->map.map_type) {
+ 	case BPF_MAP_TYPE_LRU_HASH:
+ 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
+ /* True when @htab is one of the two per-cpu hash map types. */
+ static bool htab_is_percpu(const struct bpf_htab *htab)
+ {
+ 	switch (htab->map.map_type) {
+ 	case BPF_MAP_TYPE_PERCPU_HASH:
+ 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -73,7 +91,7 @@ static void htab_free_elems(struct bpf_htab *htab)
{
int i;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto free_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -84,18 +102,34 @@ static void htab_free_elems(struct bpf_htab *htab)
free_percpu(pptr);
}
free_elems:
- vfree(htab->elems);
+ bpf_map_area_free(htab->elems);
+}
+
+ /* Get an element from the map's LRU for @key.
+  *
+  * @hash is handed to bpf_lru_pop_free(); on success @key is copied
+  * into the element (map.key_size bytes).  Returns NULL when the LRU
+  * has no node to give.
+  */
+ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
+ 					  u32 hash)
+ {
+ 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+ 	struct htab_elem *elem;
+
+ 	if (!node)
+ 		return NULL;
+
+ 	elem = container_of(node, struct htab_elem, lru_node);
+ 	memcpy(elem->key, key, htab->map.key_size);
+
+ 	return elem;
}
-static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+static int prealloc_init(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
- htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size *
+ htab->map.max_entries);
if (!htab->elems)
return -ENOMEM;
- if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ if (!htab_is_percpu(htab))
goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) {
@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
}
skip_percpu_elems:
- err = pcpu_freelist_init(&htab->freelist);
+ if (htab_is_lru(htab))
+ err = bpf_lru_init(&htab->lru,
+ htab->map.map_flags & BPF_F_NO_COMMON_LRU,
+ offsetof(struct htab_elem, hash) -
+ offsetof(struct htab_elem, lru_node),
+ htab_lru_map_delete_node,
+ htab);
+ else
+ err = pcpu_freelist_init(&htab->freelist);
+
if (err)
goto free_elems;
- pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
- htab->map.max_entries);
+ if (htab_is_lru(htab))
+ bpf_lru_populate(&htab->lru, htab->elems,
+ offsetof(struct htab_elem, lru_node),
+ htab->elem_size, htab->map.max_entries);
+ else
+ pcpu_freelist_populate(&htab->freelist, htab->elems,
+ htab->elem_size, htab->map.max_entries);
+
return 0;
free_elems:
@@ -123,6 +172,16 @@ free_elems:
return err;
}
+ /* Undo prealloc_init(): free the preallocated elements, then tear
+  * down the LRU or the per-cpu freelist, whichever the map uses.
+  */
+ static void prealloc_destroy(struct bpf_htab *htab)
+ {
+ 	htab_free_elems(htab);
+
+ 	if (!htab_is_lru(htab)) {
+ 		pcpu_freelist_destroy(&htab->freelist);
+ 		return;
+ 	}
+
+ 	bpf_lru_destroy(&htab->lru);
+ }
+
static int alloc_extra_elems(struct bpf_htab *htab)
{
void __percpu *pptr;
@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
- bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
int err, i;
u64 cost;
- if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ if (lru && !capable(CAP_SYS_ADMIN))
+ /* The LRU implementation is much more complicated than other
+ * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ */
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
+ if (!lru && percpu_lru)
+ return ERR_PTR(-EINVAL);
+
+ if (lru && !prealloc)
+ return ERR_PTR(-ENOTSUPP);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size == 0)
goto free_htab;
+ if (percpu_lru) {
+ /* ensure each CPU's lru list has >=1 elements.
+ * since we are at it, make each lru list have the same
+ * number of elements.
+ */
+ htab->map.max_entries = roundup(attr->max_entries,
+ num_possible_cpus());
+ if (htab->map.max_entries < attr->max_entries)
+ htab->map.max_entries = rounddown(attr->max_entries,
+ num_possible_cpus());
+ }
+
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
@@ -181,7 +274,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
- if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
+ if (htab->map.value_size >= KMALLOC_MAX_SIZE -
MAX_BPF_STACK - sizeof(struct htab_elem))
/* if value_size is bigger, the user space won't be able to
* access the elements via bpf syscall. This check also makes
@@ -227,28 +320,27 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
- GFP_USER | __GFP_NOWARN);
-
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
- if (!htab->buckets)
- goto free_htab;
- }
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket));
+ if (!htab->buckets)
+ goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_HEAD(&htab->buckets[i].head);
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu) {
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
err = alloc_extra_elems(htab);
if (err)
goto free_buckets;
}
- if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
- err = prealloc_elems_and_freelist(htab);
+ if (prealloc) {
+ err = prealloc_init(htab);
if (err)
goto free_extra_elems;
}
@@ -258,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_extra_elems:
free_percpu(htab->extra_elems);
free_buckets:
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -323,6 +415,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return l->key + round_up(map->key_size, 8);
+ }
+
+ return NULL;
+}
+
+/* It is called from the bpf_lru_list when the LRU needs to delete
+ * older elements from the htab.
+ */
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
+{
+ struct bpf_htab *htab = (struct bpf_htab *)arg;
+ struct htab_elem *l, *tgt_l;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+
+ tgt_l = container_of(node, struct htab_elem, lru_node);
+ b = __select_bucket(htab, tgt_l->hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l == tgt_l) {
+ hlist_del_rcu(&l->hash_node);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ return l == tgt_l;
+}
+
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -420,6 +552,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ u32 size = round_up(htab->map.value_size, 8);
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
@@ -479,18 +629,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
}
}
- if (!onallcpus) {
- /* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
- } else {
- int off = 0, cpu;
+ pcpu_copy_value(htab, pptr, value, onallcpus);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
} else {
@@ -571,6 +711,70 @@ err:
return ret;
}
+static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old = NULL;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because getting free nodes from LRU may need
+ * to remove older elements from htab and this removal
+ * operation will need a bucket lock.
+ */
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_new->lru_node);
+ hlist_del_rcu(&l_old->hash_node);
+ }
+ ret = 0;
+
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ if (ret)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ else if (l_old)
+ bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+
+ return ret;
+}
+
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool onallcpus)
@@ -606,22 +810,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
- u32 size = htab->map.value_size;
-
/* per-cpu hash map can update value in-place */
- if (!onallcpus) {
- memcpy(this_cpu_ptr(pptr), value, size);
- } else {
- int off = 0, cpu;
-
- size = round_up(size, 8);
- for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
- off += size;
- }
- }
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus, false);
@@ -637,12 +828,84 @@ err:
return ret;
}
+static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because LRU's elem alloc may need
+ * to remove an older elem from htab and this removal
+ * operation will need a bucket lock.
+ */
+ if (map_flags != BPF_EXIST) {
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ }
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_old->lru_node);
+
+ /* per-cpu hash map can update value in-place */
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ value, onallcpus);
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ l_new = NULL;
+ }
+ ret = 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l_new)
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ return ret;
+}
+
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
+static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
+ false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
@@ -676,6 +939,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
+static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ ret = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (l)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ return ret;
+}
+
static void delete_all_elements(struct bpf_htab *htab)
{
int i;
@@ -687,7 +983,8 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab_elem_free(htab, l);
+ if (l->state != HTAB_EXTRA_ELEM_USED)
+ htab_elem_free(htab, l);
}
}
}
@@ -707,14 +1004,13 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC)
delete_all_elements(htab);
- } else {
- htab_free_elems(htab);
- pcpu_freelist_destroy(&htab->freelist);
- }
+ else
+ prealloc_destroy(htab);
+
free_percpu(htab->extra_elems);
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
kfree(htab);
}
@@ -732,6 +1028,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+static const struct bpf_map_ops htab_lru_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_update_elem = htab_lru_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_type __read_mostly = {
+ .ops = &htab_lru_ops,
+ .type = BPF_MAP_TYPE_LRU_HASH,
+};
+
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -743,8 +1053,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ }
+
+ return NULL;
+}
+
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -760,6 +1083,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
+ if (htab_is_lru(htab))
+ bpf_lru_node_set_ref(&l->lru_node);
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
@@ -775,10 +1100,16 @@ out:
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
int ret;
rcu_read_lock();
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ if (htab_is_lru(htab))
+ ret = __htab_lru_percpu_map_update_elem(map, key, value,
+ map_flags, true);
+ else
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
+ true);
rcu_read_unlock();
return ret;
@@ -798,10 +1129,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
+static const struct bpf_map_ops htab_lru_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_update_elem = htab_lru_percpu_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+ .ops = &htab_lru_percpu_ops,
+ .type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type);
+ bpf_register_map_type(&htab_lru_type);
+ bpf_register_map_type(&htab_lru_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afba1a4f..045cbe673356 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,9 +13,11 @@
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
+#include <linux/topology.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
+#include <linux/filter.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -26,48 +28,32 @@
* if program is allowed to access maps, so check rcu_read_lock_held in
* all three functions.
*/
-static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- /* verifier checked that R1 contains a valid pointer to bpf_map
- * and R2 points to a program stack and map->key_size bytes were
- * initialized
- */
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- value = map->ops->map_lookup_elem(map, key);
-
- /* lookup() returns either pointer to element value or NULL
- * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
- */
- return (unsigned long) value;
+ return (unsigned long) map->ops->map_lookup_elem(map, key);
}
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
.func = bpf_map_lookup_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
+ void *, value, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value = (void *) (unsigned long) r3;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- return map->ops->map_update_elem(map, key, value, r4);
+ return map->ops->map_update_elem(map, key, value, flags);
}
const struct bpf_func_proto bpf_map_update_elem_proto = {
.func = bpf_map_update_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -75,19 +61,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
return map->ops->map_delete_elem(map, key);
}
const struct bpf_func_proto bpf_map_delete_elem_proto = {
.func = bpf_map_delete_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -99,7 +82,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_smp_processor_id)
{
return smp_processor_id();
}
@@ -110,7 +93,18 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_numa_node_id)
+{
+ return numa_node_id();
+}
+
+const struct bpf_func_proto bpf_get_numa_node_id_proto = {
+ .func = bpf_get_numa_node_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_ns)
{
/* NMI safe access to clock monotonic */
return ktime_get_mono_fast_ns();
@@ -122,11 +116,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
return (u64) task->tgid << 32 | task->pid;
@@ -138,18 +132,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_uid_gid)
{
struct task_struct *task = current;
kuid_t uid;
kgid_t gid;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
current_uid_gid(&uid, &gid);
return (u64) from_kgid(&init_user_ns, gid) << 32 |
- from_kuid(&init_user_ns, uid);
+ from_kuid(&init_user_ns, uid);
}
const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
@@ -158,10 +152,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
{
struct task_struct *task = current;
- char *buf = (char *) (long) r1;
if (unlikely(!task))
goto err_clear;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5967b870a895..0b030c9126d3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -18,6 +18,7 @@
#include <linux/namei.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
+#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
@@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
switch (mode & S_IFMT) {
case S_IFDIR:
case S_IFREG:
+ case S_IFLNK:
break;
default:
return ERR_PTR(-EINVAL);
@@ -97,7 +99,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = CURRENT_TIME;
+ inode->i_atime = current_time(inode);
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
@@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
return 0;
}
+static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
+}
+
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inc_nlink(inode);
inc_nlink(dir);
- d_instantiate(dentry, inode);
- dget(dentry);
-
+ bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
@@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
inode->i_op = iops;
inode->i_private = dentry->d_fsdata;
- d_instantiate(dentry, inode);
- dget(dentry);
-
+ bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
@@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
if (strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
+
return simple_lookup(dir, dentry, flags);
}
+static int bpf_symlink(struct inode *dir, struct dentry *dentry,
+ const char *target)
+{
+ char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
+ struct inode *inode;
+
+ if (!link)
+ return -ENOMEM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
+ if (IS_ERR(inode)) {
+ kfree(link);
+ return PTR_ERR(inode);
+ }
+
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = link;
+
+ bpf_dentry_finalize(dentry, inode, dir);
+ return 0;
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = bpf_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
+ .symlink = bpf_symlink,
.rmdir = simple_rmdir,
.rename = simple_rename,
.link = simple_link,
@@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
if (!bpf_inode_type(inode, &type))
bpf_any_put(inode->i_private, type);
}
@@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode)
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
+ .show_options = generic_show_options,
.evict_inode = bpf_evict_inode,
};
+enum {
+ OPT_MODE,
+ OPT_ERR,
+};
+
+static const match_table_t bpf_mount_tokens = {
+ { OPT_MODE, "mode=%o" },
+ { OPT_ERR, NULL },
+};
+
+struct bpf_mount_opts {
+ umode_t mode;
+};
+
+static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option, token;
+ char *ptr;
+
+ opts->mode = S_IRWXUGO;
+
+ while ((ptr = strsep(&data, ",")) != NULL) {
+ if (!*ptr)
+ continue;
+
+ token = match_token(ptr, bpf_mount_tokens, args);
+ switch (token) {
+ case OPT_MODE:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /* We might like to report bad mount options here, but
+ * traditionally we've ignored all mount options, so we'd
+ * better continue to ignore non-existing options for bpf.
+ */
+ }
+ }
+
+ return 0;
+}
+
static int bpf_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr bpf_rfiles[] = { { "" } };
+ struct bpf_mount_opts opts;
struct inode *inode;
int ret;
+ save_mount_options(sb, data);
+
+ ret = bpf_parse_options(data, &opts);
+ if (ret)
+ return ret;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
inode = sb->s_root->d_inode;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= S_ISVTX | S_IRWXUGO;
+ inode->i_mode |= S_ISVTX | opts.mode;
return 0;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bf4495fcd25d..be8519148c25 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
@@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
if (!smap->elems)
return -ENOMEM;
@@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
return 0;
free_elems:
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
return err;
}
@@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
- if (!smap) {
- smap = vzalloc(cost);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- }
+ smap = bpf_map_area_alloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
err = -E2BIG;
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
@@ -112,14 +108,13 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
put_buffers:
put_callchain_buffers();
free_smap:
- kvfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(err);
}
-u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
@@ -263,9 +258,9 @@ static void stack_map_free(struct bpf_map *map)
/* wait for bpf programs to complete before freeing stack map */
synchronize_rcu();
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
- kvfree(smap);
+ bpf_map_area_free(smap);
put_callchain_buffers();
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..bbb016adbaeb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,11 +12,14 @@
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
+#include <linux/kernel.h>
DEFINE_PER_CPU(int, bpf_prog_active);
@@ -48,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+void *bpf_map_area_alloc(size_t size)
+{
+ /* We definitely need __GFP_NORETRY, so OOM killer doesn't
+ * trigger under memory pressure as we really just want to
+ * fail instead.
+ */
+ const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+ void *area;
+
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc(size, GFP_USER | flags);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+ PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
@@ -137,18 +164,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
+ const struct bpf_array *array;
+ u32 owner_prog_type = 0;
+
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+ array = container_of(map, struct bpf_array, map);
+ owner_prog_type = array->owner_prog_type;
+ }
seq_printf(m,
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
"max_entries:\t%u\n"
- "map_flags:\t%#x\n",
+ "map_flags:\t%#x\n"
+ "memlock:\t%llu\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
- map->map_flags);
+ map->map_flags,
+ map->pages * 1ULL << PAGE_SHIFT);
+
+ if (owner_prog_type)
+ seq_printf(m, "owner_prog_type:\t%u\n",
+ owner_prog_type);
}
#endif
@@ -194,7 +234,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_charge_memlock(map);
if (err)
- goto free_map;
+ goto free_map_nouncharge;
err = bpf_map_new_fd(map);
if (err < 0)
@@ -204,6 +244,8 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
}
@@ -252,12 +294,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* helper to convert user pointers passed inside __aligned_u64 fields */
-static void __user *u64_to_ptr(__u64 val)
-{
- return (void __user *) (unsigned long) val;
-}
-
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -268,8 +304,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
static int map_lookup_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value, *ptr;
@@ -295,6 +331,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -305,7 +342,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -342,8 +380,8 @@ err_put:
static int map_update_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
@@ -369,6 +407,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
@@ -388,7 +427,8 @@ static int map_update_elem(union bpf_attr *attr)
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
@@ -420,7 +460,7 @@ err_put:
static int map_delete_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
+ void __user *ukey = u64_to_user_ptr(attr->key);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -464,8 +504,8 @@ err_put:
static int map_get_next_key(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *unext_key = u64_to_ptr(attr->next_key);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *unext_key = u64_to_user_ptr(attr->next_key);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
@@ -565,6 +605,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_xdp_adjust_head)
+ prog->xdp_adjust_head = 1;
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
@@ -599,19 +641,39 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+int __bpf_prog_charge(struct user_struct *user, u32 pages)
+{
+ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ unsigned long user_bufs;
+
+ if (user) {
+ user_bufs = atomic_long_add_return(pages, &user->locked_vm);
+ if (user_bufs > memlock_limit) {
+ atomic_long_sub(pages, &user->locked_vm);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
+{
+ if (user)
+ atomic_long_sub(pages, &user->locked_vm);
+}
+
static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = get_current_user();
- unsigned long memlock_limit;
-
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ int ret;
- atomic_long_add(prog->pages, &user->locked_vm);
- if (atomic_long_read(&user->locked_vm) > memlock_limit) {
- atomic_long_sub(prog->pages, &user->locked_vm);
+ ret = __bpf_prog_charge(user, prog->pages);
+ if (ret) {
free_uid(user);
- return -EPERM;
+ return ret;
}
+
prog->aux->user = user;
return 0;
}
@@ -620,7 +682,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = prog->aux->user;
- atomic_long_sub(prog->pages, &user->locked_vm);
+ __bpf_prog_uncharge(user, prog->pages);
free_uid(user);
}
@@ -648,8 +710,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_prog *prog = filp->private_data;
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_type:\t%u\n"
+ "prog_jited:\t%u\n"
+ "prog_tag:\t%s\n"
+ "memlock:\t%llu\n",
+ prog->type,
+ prog->jited,
+ prog_tag,
+ prog->pages * 1ULL << PAGE_SHIFT);
+}
+#endif
+
static const struct file_operations bpf_prog_fops = {
- .release = bpf_prog_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_prog_show_fdinfo,
+#endif
+ .release = bpf_prog_release,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
@@ -680,10 +764,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
}
EXPORT_SYMBOL_GPL(bpf_prog_add);
+void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+ /* Only to be used for undoing previous bpf_prog_add() in some
+ * error path. We still know that another entity in our call
+ * path holds a reference to the program, thus atomic_sub() can
+ * be safely used in such cases!
+ */
+ WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_sub);
+
struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
{
return bpf_prog_add(prog, 1);
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc);
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
@@ -730,7 +826,7 @@ static int bpf_prog_load(union bpf_attr *attr)
return -EINVAL;
/* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
@@ -738,8 +834,8 @@ static int bpf_prog_load(union bpf_attr *attr)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible(license);
- if (attr->insn_cnt >= BPF_MAXINSNS)
- return -EINVAL;
+ if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
+ return -E2BIG;
if (type == BPF_PROG_TYPE_KPROBE &&
attr->kern_version != LINUX_VERSION_CODE)
@@ -760,8 +856,8 @@ static int bpf_prog_load(union bpf_attr *attr)
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
- prog->len * sizeof(struct bpf_insn)) != 0)
+ if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
+ bpf_prog_insn_size(prog)) != 0)
goto free_prog;
prog->orig_prog = NULL;
@@ -811,7 +907,7 @@ static int bpf_obj_pin(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_OBJ))
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
@@ -819,8 +915,92 @@ static int bpf_obj_get(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
return -EINVAL;
- return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+}
+
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
+ attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ if (ret)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
+
+ return ret;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
}
+#endif /* CONFIG_CGROUP_BPF */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
@@ -888,6 +1068,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
+
+#ifdef CONFIG_CGROUP_BPF
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+#endif
+
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index daea765d72e6..cdc43b899f28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14,10 +14,12 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
+#include <linux/stringify.h>
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
@@ -126,76 +128,16 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/
-struct reg_state {
- enum bpf_reg_type type;
- union {
- /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
- s64 imm;
-
- /* valid when type == PTR_TO_PACKET* */
- struct {
- u32 id;
- u16 off;
- u16 range;
- };
-
- /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
- * PTR_TO_MAP_VALUE_OR_NULL
- */
- struct bpf_map *map_ptr;
- };
-};
-
-enum bpf_stack_slot_type {
- STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* register spilled into stack */
- STACK_MISC /* BPF program wrote some data into this slot */
-};
-
-#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
-
-/* state of the program:
- * type of all registers and stack info
- */
-struct verifier_state {
- struct reg_state regs[MAX_BPF_REG];
- u8 stack_slot_type[MAX_BPF_STACK];
- struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
-};
-
-/* linked list of verifier states used to prune search */
-struct verifier_state_list {
- struct verifier_state state;
- struct verifier_state_list *next;
-};
-
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
-struct verifier_stack_elem {
+struct bpf_verifier_stack_elem {
/* verifer state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
- struct verifier_state st;
+ struct bpf_verifier_state st;
int insn_idx;
int prev_insn_idx;
- struct verifier_stack_elem *next;
-};
-
-#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
-
-/* single container for all structs
- * one verifier_env per bpf_check() call
- */
-struct verifier_env {
- struct bpf_prog *prog; /* eBPF program being verified */
- struct verifier_stack_elem *head; /* stack of verifier states to be processed */
- int stack_size; /* number of states to be processed */
- struct verifier_state cur_state; /* current verifier state */
- struct verifier_state_list **explored_states; /* search pruning optimization */
- struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
- u32 used_map_cnt; /* number of used maps */
- u32 id_gen; /* used to generate unique reg IDs */
- bool allow_ptr_leaks;
+ struct bpf_verifier_stack_elem *next;
};
#define BPF_COMPLEXITY_LIMIT_INSNS 65536
@@ -204,6 +146,7 @@ struct verifier_env {
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
+ bool pkt_access;
int regno;
int access_size;
};
@@ -240,6 +183,7 @@ static const char * const reg_type_str[] = {
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
[FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
[CONST_IMM] = "imm",
@@ -247,9 +191,25 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
-static void print_verifier_state(struct verifier_state *state)
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+static const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+static void print_verifier_state(struct bpf_verifier_state *state)
{
- struct reg_state *reg;
+ struct bpf_reg_state *reg;
enum bpf_reg_type t;
int i;
@@ -267,10 +227,18 @@ static void print_verifier_state(struct verifier_state *state)
else if (t == UNKNOWN_VALUE && reg->imm)
verbose("%lld", reg->imm);
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose("(ks=%d,vs=%d)",
+ t == PTR_TO_MAP_VALUE_OR_NULL ||
+ t == PTR_TO_MAP_VALUE_ADJ)
+ verbose("(ks=%d,vs=%d,id=%u)",
reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ reg->map_ptr->value_size,
+ reg->id);
+ if (reg->min_value != BPF_REGISTER_MIN_RANGE)
+ verbose(",min_value=%lld",
+ (long long)reg->min_value);
+ if (reg->max_value != BPF_REGISTER_MAX_RANGE)
+ verbose(",max_value=%llu",
+ (unsigned long long)reg->max_value);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -403,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn)
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
- verbose("(%02x) call %d\n", insn->code, insn->imm);
+ verbose("(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose("(%02x) goto pc%+d\n",
insn->code, insn->off);
@@ -425,9 +394,9 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}
-static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
int insn_idx;
if (env->head == NULL)
@@ -444,12 +413,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
return insn_idx;
}
-static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
- int prev_insn_idx)
+static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
- elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
@@ -475,13 +444,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void init_reg_state(struct reg_state *regs)
+static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
regs[i].type = NOT_INIT;
regs[i].imm = 0;
+ regs[i].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[i].max_value = BPF_REGISTER_MAX_RANGE;
}
/* frame pointer */
@@ -491,20 +462,32 @@ static void init_reg_state(struct reg_state *regs)
regs[BPF_REG_1].type = PTR_TO_CTX;
}
-static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].id = 0;
regs[regno].imm = 0;
}
+static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ __mark_reg_unknown_value(regs, regno);
+}
+
+static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+{
+ regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct reg_state *regs, u32 regno,
+static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
enum reg_arg_type t)
{
if (regno >= MAX_BPF_REG) {
@@ -564,8 +547,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct verifier_state *state, int off, int size,
- int value_regno)
+static int check_stack_write(struct bpf_verifier_state *state, int off,
+ int size, int value_regno)
{
int i;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -590,7 +573,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
} else {
/* regular write of data into stack */
state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct reg_state) {};
+ (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -598,7 +581,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
return 0;
}
-static int check_stack_read(struct verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
@@ -639,7 +622,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct verifier_env *env, u32 regno, int off,
+static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -654,24 +637,38 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
-static bool may_write_pkt_data(enum bpf_prog_type type)
+static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
- switch (type) {
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ /* dst_input() and dst_output() can't write for now */
+ if (t == BPF_WRITE)
+ return false;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ if (meta)
+ return meta->pkt_access;
+
+ env->seen_direct_write = true;
return true;
default:
return false;
}
}
-static int check_packet_access(struct verifier_env *env, u32 regno, int off,
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *reg = &regs[regno];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
off += reg->off;
- if (off < 0 || off + size > reg->range) {
+ if (off < 0 || size <= 0 || off + size > reg->range) {
verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
@@ -680,9 +677,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
}
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ /* for analyzer ctx accesses are already validated and converted */
+ if (env->analyzer_ops)
+ return 0;
+
if (env->prog->aux->ops->is_valid_access &&
env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
/* remember the offset of last byte accessed in ctx */
@@ -695,7 +696,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
return -EACCES;
}
-static bool is_pointer_value(struct verifier_env *env, int regno)
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
if (env->allow_ptr_leaks)
return false;
@@ -709,28 +710,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
-static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
- int off, int size)
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int off, int size)
{
- if (reg->type != PTR_TO_PACKET) {
+ if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
if (off % size != 0) {
- verbose("misaligned access off %d size %d\n", off, size);
+ verbose("misaligned access off %d size %d\n",
+ off, size);
return -EACCES;
} else {
return 0;
}
}
- switch (env->prog->type) {
- case BPF_PROG_TYPE_SCHED_CLS:
- case BPF_PROG_TYPE_SCHED_ACT:
- case BPF_PROG_TYPE_XDP:
- break;
- default:
- verbose("verifier is misconfigured\n");
- return -EACCES;
- }
-
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
/* misaligned access to packet is ok on x86,arm,arm64 */
return 0;
@@ -741,7 +733,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
}
/* skb->data is NET_IP_ALIGN-ed */
- if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+ if (reg->type == PTR_TO_PACKET &&
+ (NET_IP_ALIGN + reg->off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
NET_IP_ALIGN, reg->off, off, size);
return -EACCES;
@@ -755,12 +748,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
if (reg->type == PTR_TO_STACK)
@@ -774,12 +767,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_VALUE ||
+ reg->type == PTR_TO_MAP_VALUE_ADJ) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
+
+ /* If we adjusted the register to this map value at all then we
+ * need to change off and size to min_value and max_value
+ * respectively to make sure our theoretical access will be
+ * safe.
+ */
+ if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ if (log_level)
+ print_verifier_state(state);
+ env->varlen_map_value_access = true;
+ /* The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * index'es we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, reg->min_value + off,
+ size);
+ if (err) {
+ verbose("R%d min value is outside of the array range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail
+ * since we can't be sure we won't do bad things.
+ */
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ regno);
+ return -EACCES;
+ }
+ off += reg->max_value;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -795,9 +828,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_ctx_access(env, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value(state->regs, value_regno);
- if (env->allow_ptr_leaks)
- /* note that reg.[id|off|range] == 0 */
- state->regs[value_regno].type = reg_type;
+ /* note that reg.[id|off|range] == 0 */
+ state->regs[value_regno].type = reg_type;
}
} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -817,7 +849,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
}
@@ -846,9 +878,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -882,12 +914,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized
*/
-static int check_stack_boundary(struct verifier_env *env, int regno,
+static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs;
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
@@ -926,18 +958,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
return 0;
}
-static int check_func_arg(struct verifier_env *env, u32 regno,
+static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct reg_state *reg = env->cur_state.regs + regno;
- enum bpf_reg_type expected_type;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
if (arg_type == ARG_DONTCARE)
return 0;
- if (reg->type == NOT_INIT) {
+ if (type == NOT_INIT) {
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
@@ -950,16 +982,30 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return 0;
}
+ if (type == PTR_TO_PACKET &&
+ !may_access_direct_pkt_data(env, meta, BPF_READ)) {
+ verbose("helper access to the packet is not allowed\n");
+ return -EACCES;
+ }
+
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
+ if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_CONST_STACK_SIZE ||
arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_CTX) {
expected_type = PTR_TO_CTX;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_STACK ||
arg_type == ARG_PTR_TO_RAW_STACK) {
expected_type = PTR_TO_STACK;
@@ -967,20 +1013,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
* passed in as argument, it's a CONST_IMM type. Final test
* happens during stack boundary checking.
*/
- if (reg->type == CONST_IMM && reg->imm == 0)
- expected_type = CONST_IMM;
+ if (type == CONST_IMM && reg->imm == 0)
+ /* final test in check_stack_boundary() */;
+ else if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
- if (reg->type != expected_type) {
- verbose("R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type], reg_type_str[expected_type]);
- return -EACCES;
- }
-
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
@@ -998,8 +1040,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
- false, NULL);
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->key_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->key_size,
+ false, NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -1009,9 +1056,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno,
- meta->map_ptr->value_size,
- false, NULL);
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->value_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->value_size,
+ false, NULL);
} else if (arg_type == ARG_CONST_STACK_SIZE ||
arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -1025,11 +1076,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
+ if (regs[regno - 1].type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno - 1, 0, reg->imm);
+ else
+ err = check_stack_boundary(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
}
return err;
+err_type:
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[type], reg_type_str[expected_type]);
+ return -EACCES;
}
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
@@ -1053,7 +1111,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
- if (func_id != BPF_FUNC_skb_under_cgroup)
+ if (func_id != BPF_FUNC_skb_under_cgroup &&
+ func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
default:
@@ -1075,6 +1134,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+ case BPF_FUNC_current_task_under_cgroup:
case BPF_FUNC_skb_under_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
@@ -1085,8 +1145,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %d\n",
- map->map_type, func_id);
+ verbose("cannot pass map_type %d into func %s#%d\n",
+ map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1108,10 +1168,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-static void clear_all_pkt_pointers(struct verifier_env *env)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs, *reg;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
@@ -1131,19 +1191,19 @@ static void clear_all_pkt_pointers(struct verifier_env *env)
}
}
-static int check_call(struct verifier_env *env, int func_id)
+static int check_call(struct bpf_verifier_env *env, int func_id)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct reg_state *regs = state->regs;
- struct reg_state *reg;
+ struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *reg;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %d\n", func_id);
+ verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1151,7 +1211,7 @@ static int check_call(struct verifier_env *env, int func_id)
fn = env->prog->aux->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %d\n", func_id);
+ verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1161,16 +1221,18 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
- changes_data = bpf_helper_changes_skb_data(fn->func);
+ changes_data = bpf_helper_changes_pkt_data(fn->func);
memset(&meta, 0, sizeof(meta));
+ meta.pkt_access = fn->pkt_access;
/* We only support one arg being in raw mode at the moment, which
* is sufficient for the helper functions we have right now.
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %d\n", func_id);
+ verbose("kernel subsystem misconfigured func %s#%d\n",
+ func_id_name(func_id), func_id);
return err;
}
@@ -1214,6 +1276,7 @@ static int check_call(struct verifier_env *env, int func_id)
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1223,9 +1286,10 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
- verbose("unknown return type %d of func %d\n",
- fn->ret_type, func_id);
+ verbose("unknown return type %d of func %s#%d\n",
+ fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1238,12 +1302,13 @@ static int check_call(struct verifier_env *env, int func_id)
return 0;
}
-static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
+static int check_packet_ptr_add(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
- struct reg_state *src_reg = &regs[insn->src_reg];
- struct reg_state tmp_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state tmp_reg;
s32 imm;
if (BPF_SRC(insn->code) == BPF_K) {
@@ -1311,10 +1376,10 @@ add_imm:
return 0;
}
-static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
+static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
u8 opcode = BPF_OP(insn->code);
s64 imm_log2;
@@ -1324,7 +1389,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
*/
if (BPF_SRC(insn->code) == BPF_X) {
- struct reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
dst_reg->imm && opcode == BPF_ADD) {
@@ -1413,30 +1478,158 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
- struct reg_state *dst_reg = &regs[insn->dst_reg];
- struct reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
- /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
- * Don't care about overflow or negative values, just add them
+ /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
+ * insn. Don't care about overflow or negative values, just add them
*/
if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
dst_reg->imm += insn->imm;
else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
src_reg->type == CONST_IMM)
dst_reg->imm += src_reg->imm;
+ else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
+ dst_reg->imm |= insn->imm;
+ else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM)
+ dst_reg->imm |= src_reg->imm;
else
mark_reg_unknown_value(regs, insn->dst_reg);
return 0;
}
+static void check_reg_overflow(struct bpf_reg_state *reg)
+{
+ if (reg->max_value > BPF_REGISTER_MAX_RANGE)
+ reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
+ reg->min_value > BPF_REGISTER_MAX_RANGE)
+ reg->min_value = BPF_REGISTER_MIN_RANGE;
+}
+
+static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ s64 min_val = BPF_REGISTER_MIN_RANGE;
+ u64 max_val = BPF_REGISTER_MAX_RANGE;
+ u8 opcode = BPF_OP(insn->code);
+
+ dst_reg = &regs[insn->dst_reg];
+ if (BPF_SRC(insn->code) == BPF_X) {
+ check_reg_overflow(&regs[insn->src_reg]);
+ min_val = regs[insn->src_reg].min_value;
+ max_val = regs[insn->src_reg].max_value;
+
+ /* If the source register is a random pointer then the
+ * min_value/max_value values represent the range of the known
+ * accesses into that value, not the actual min/max value of the
+ * register itself. In this case we have to reset the reg range
+ * values so we know it is not safe to look at.
+ */
+ if (regs[insn->src_reg].type != CONST_IMM &&
+ regs[insn->src_reg].type != UNKNOWN_VALUE) {
+ min_val = BPF_REGISTER_MIN_RANGE;
+ max_val = BPF_REGISTER_MAX_RANGE;
+ }
+ } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
+ (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
+ min_val = max_val = insn->imm;
+ }
+
+ /* We don't know anything about what was done to this register, mark it
+ * as unknown.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) {
+ reset_reg_range_values(regs, insn->dst_reg);
+ return;
+ }
+
+ /* If one of our values was at the end of our ranges then we can't just
+ * do our normal operations to the register, we need to set the values
+ * to the min/max since they are undefined.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+
+ switch (opcode) {
+ case BPF_ADD:
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value += min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value += max_val;
+ break;
+ case BPF_SUB:
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value -= min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value -= max_val;
+ break;
+ case BPF_MUL:
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value *= min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value *= max_val;
+ break;
+ case BPF_AND:
+ /* Disallow AND'ing of negative numbers, ain't nobody got time
+ * for that. Otherwise the minimum is 0 and the max is the max
+ * value we could AND against.
+ */
+ if (min_val < 0)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value = 0;
+ dst_reg->max_value = max_val;
+ break;
+ case BPF_LSH:
+ /* Gotta have special overflow logic here, if we're shifting
+ * by more than ilog2(MAX_RANGE) bits then just assume we have
+ * an invalid range.
+ */
+ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value <<= min_val;
+
+ if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value <<= max_val;
+ break;
+ case BPF_RSH:
+ /* RSH by a negative number is undefined, and the BPF_RSH is an
+ * unsigned shift, so make the appropriate casts.
+ */
+ if (min_val < 0 || dst_reg->min_value < 0)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value =
+ (u64)(dst_reg->min_value) >> min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value >>= max_val;
+ break;
+ default:
+ reset_reg_range_values(regs, insn->dst_reg);
+ break;
+ }
+
+ check_reg_overflow(dst_reg);
+}
+
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
+static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1496,6 +1689,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ /* we are setting our register to something new, we need to
+ * reset its range values.
+ */
+ reset_reg_range_values(regs, insn->dst_reg);
+
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -1508,8 +1706,7 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
}
- regs[insn->dst_reg].type = UNKNOWN_VALUE;
- regs[insn->dst_reg].map_ptr = NULL;
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
} else {
/* case: R = imm
@@ -1517,6 +1714,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].max_value = insn->imm;
+ regs[insn->dst_reg].min_value = insn->imm;
}
} else if (opcode > BPF_END) {
@@ -1569,6 +1768,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
dst_reg = &regs[insn->dst_reg];
+ /* first we want to adjust our ranges. */
+ adjust_reg_min_max_vals(env, insn);
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
@@ -1603,28 +1805,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
- /* mark dest operand */
- mark_reg_unknown_value(regs, insn->dst_reg);
+ /* If we did pointer math on a map value then just set it to our
+ * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
+ * loads to this register appropriately, otherwise just mark the
+ * register as unknown.
+ */
+ if (env->allow_ptr_leaks &&
+ (dst_reg->type == PTR_TO_MAP_VALUE ||
+ dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
+ dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
+ else
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
return 0;
}
-static void find_good_pkt_pointers(struct verifier_env *env,
- struct reg_state *dst_reg)
+static void find_good_pkt_pointers(struct bpf_verifier_state *state,
+ struct bpf_reg_state *dst_reg)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs, *reg;
+ struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* r2 = r3;
- * r2 += 8
- * if (r2 > pkt_end) goto somewhere
- * r2 == dst_reg, pkt_end == src_reg,
- * r2=pkt(id=n,off=8,r=0)
- * r3=pkt(id=n,off=0,r=0)
- * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
- * so that range of bytes [r3, r3 + 8) is safe to access
+
+ /* LLVM can generate two kinds of checks:
+ *
+ * Type 1:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 > pkt_end) goto <handle exception>
+ * <access okay>
+ *
+ * Where:
+ * r2 == dst_reg, pkt_end == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Type 2:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end >= r2) goto <access okay>
+ * <handle exception>
+ *
+ * Where:
+ * pkt_end == dst_reg, r2 == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+ * so that range of bytes [r3, r3 + 8) is safe to access.
*/
+
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
regs[i].range = dst_reg->off;
@@ -1638,11 +1870,146 @@ static void find_good_pkt_pointers(struct verifier_env *env,
}
}
-static int check_cond_jmp_op(struct verifier_env *env,
+/* Adjusts the register min/max values in the case that the dst_reg is the
+ * variable register that we are working on, and src_reg is a constant or we're
+ * simply doing a BPF_K check.
+ */
+static void reg_set_min_max(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ break;
+ case BPF_JGT:
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ case BPF_JSGT:
+ /* If this is false then we know the maximum val is val,
+ * otherwise we know the min val is val+1.
+ */
+ false_reg->max_value = val;
+ true_reg->min_value = val + 1;
+ break;
+ case BPF_JGE:
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ case BPF_JSGE:
+ /* If this is false then we know the maximum value is val - 1,
+ * otherwise we know the minimum value is val.
+ */
+ false_reg->max_value = val - 1;
+ true_reg->min_value = val;
+ break;
+ default:
+ break;
+ }
+
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+}
+
+/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
+ * is the variable reg.
+ */
+static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ break;
+ case BPF_JGT:
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ case BPF_JSGT:
+ /*
+ * If this is false, then the val is <= the register, if it is
+ * true then the register is < the val.
+ */
+ false_reg->min_value = val;
+ true_reg->max_value = val - 1;
+ break;
+ case BPF_JGE:
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ case BPF_JSGE:
+ /* If this is false then constant < register, if it is true then
+ * the register <= constant.
+ */
+ false_reg->min_value = val + 1;
+ true_reg->max_value = val;
+ break;
+ default:
+ break;
+ }
+
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+}
+
+static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *reg = &regs[regno];
+
+ if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+ reg->type = type;
+ /* We don't need id from this point onwards anymore, thus we
+ * should better reset it, so that state pruning has chances
+ * to take effect.
+ */
+ reg->id = 0;
+ if (type == UNKNOWN_VALUE)
+ __mark_reg_unknown_value(regs, regno);
+ }
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *regs = state->regs;
+ u32 id = regs[regno].id;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_map_reg(regs, i, id, type);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ }
+}
+
+static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct reg_state *regs = env->cur_state.regs, *dst_reg;
- struct verifier_state *other_branch;
+ struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1704,32 +2071,48 @@ static int check_cond_jmp_op(struct verifier_env *env,
if (!other_branch)
return -EFAULT;
- /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ /* detect if we are comparing against a constant value so we can adjust
+ * our min/max values for our dst register.
+ */
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (regs[insn->src_reg].type == CONST_IMM)
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].imm,
+ opcode);
+ else if (dst_reg->type == CONST_IMM)
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg], dst_reg->imm,
+ opcode);
+ } else {
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, insn->imm, opcode);
+ }
+
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (opcode == BPF_JEQ) {
- /* next fallthrough insn can access memory via
- * this register
- */
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- /* branch targer cannot access it, since reg == 0 */
- mark_reg_unknown_value(other_branch->regs,
- insn->dst_reg);
- } else {
- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- mark_reg_unknown_value(regs, insn->dst_reg);
- }
+ /* Mark all identical map registers in each branch as either
+ * safe or unknown depending on the R == 0 or R != 0 conditional.
+ */
+ mark_map_regs(this_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
+ mark_map_regs(other_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(env, dst_reg);
+ find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
}
if (log_level)
- print_verifier_state(&env->cur_state);
+ print_verifier_state(this_branch);
return 0;
}
@@ -1742,9 +2125,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
}
/* verify BPF_LD_IMM64 instruction */
-static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -1760,9 +2143,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
- if (insn->src_reg == 0)
- /* generic move 64-bit immediate into a register */
+ if (insn->src_reg == 0) {
+ /* generic move 64-bit immediate into a register,
+ * only analyzer needs to collect the ld_imm value.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+
+ if (!env->analyzer_ops)
+ return 0;
+
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = imm;
return 0;
+ }
/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
@@ -1799,11 +2192,11 @@ static bool may_access_skb(enum bpf_prog_type type)
* Output:
* R0 - 8/16/32-bit skb data converted to cpu endianness
*/
-static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 mode = BPF_MODE(insn->code);
- struct reg_state *reg;
+ struct bpf_reg_state *reg;
int i, err;
if (!may_access_skb(env->prog->type)) {
@@ -1889,7 +2282,7 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
static int *insn_stack; /* stack of insns to process */
static int cur_stack; /* current stack index */
@@ -1900,7 +2293,7 @@ static int *insn_state;
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
@@ -1941,7 +2334,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env)
/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
-static int check_cfg(struct verifier_env *env)
+static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -2050,7 +2443,8 @@ err_free:
/* the following conditions reduce the number of explored insns
* from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
*/
-static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
+static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
{
if (old->id != cur->id)
return false;
@@ -2125,9 +2519,12 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
{
- struct reg_state *rold, *rcur;
+ bool varlen_map_access = env->varlen_map_value_access;
+ struct bpf_reg_state *rold, *rcur;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
@@ -2137,8 +2534,20 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
if (memcmp(rold, rcur, sizeof(*rold)) == 0)
continue;
+ /* If the ranges were not the same, but everything else was and
+ * we didn't do a variable access into a map then we are a-ok.
+ */
+ if (!varlen_map_access &&
+ memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
+ continue;
+
+ /* If we didn't do a variable map access then again we don't care
+ * about the mismatched range values and it's ok if our old type
+ * was UNKNOWN and we didn't go to a NOT_INIT'ed reg.
+ */
if (rold->type == NOT_INIT ||
- (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
+ (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
+ rcur->type != NOT_INIT))
continue;
if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
@@ -2167,9 +2576,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
* the same, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
* but current path has stored:
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
@@ -2180,10 +2589,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
return true;
}
-static int is_state_visited(struct verifier_env *env, int insn_idx)
+static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
- struct verifier_state_list *new_sl;
- struct verifier_state_list *sl;
+ struct bpf_verifier_state_list *new_sl;
+ struct bpf_verifier_state_list *sl;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -2193,7 +2602,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(&sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state))
/* reached equivalent register/stack state,
* prune the search
*/
@@ -2207,7 +2616,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
if (!new_sl)
return -ENOMEM;
@@ -2218,11 +2627,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
}
-static int do_check(struct verifier_env *env)
+static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
+{
+ if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
+ return 0;
+
+ return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+}
+
+static int do_check(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
- struct reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
@@ -2230,6 +2648,7 @@ static int do_check(struct verifier_env *env)
init_reg_state(regs);
insn_idx = 0;
+ env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -2276,13 +2695,17 @@ static int do_check(struct verifier_env *env)
print_bpf_insn(insn);
}
+ err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
+ if (err)
+ return err;
+
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ enum bpf_reg_type *prev_src_type, src_reg_type;
/* check for reserved fields is already done */
@@ -2306,21 +2729,25 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W) {
+ reset_reg_range_values(regs, insn->dst_reg);
+ if (BPF_SIZE(insn->code) != BPF_W &&
+ BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
continue;
}
- if (insn->imm == 0) {
+ prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
* dst_reg = *(u32 *)(src_reg + off)
- * use reserved 'imm' field to mark this insn
+ * save type to validate intersecting paths
*/
- insn->imm = src_reg_type;
+ *prev_src_type = src_reg_type;
- } else if (src_reg_type != insn->imm &&
+ } else if (src_reg_type != *prev_src_type &&
(src_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_src_type == PTR_TO_CTX)) {
/* ABuser program is trying to use the same insn
* dst_reg = *(u32*) (src_reg + off)
* with different pointer types:
@@ -2333,7 +2760,7 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
+ enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
@@ -2361,11 +2788,13 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
- if (insn->imm == 0) {
- insn->imm = dst_reg_type;
- } else if (dst_reg_type != insn->imm &&
+ prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_dst_type == NOT_INIT) {
+ *prev_dst_type = dst_reg_type;
+ } else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_dst_type == PTR_TO_CTX)) {
verbose("same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -2471,6 +2900,7 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
+ reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -2483,14 +2913,32 @@ process_bpf_exit:
return 0;
}
+static int check_map_prog_compatibility(struct bpf_map *map,
+ struct bpf_prog *prog)
+
+{
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
+ (map->map_flags & BPF_F_NO_PREALLOC)) {
+ verbose("perf_event programs can only use preallocated hash map\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
-static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j;
+ int i, j, err;
+
+ err = bpf_prog_calc_tag(env->prog);
+ if (err)
+ return err;
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
@@ -2534,6 +2982,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return PTR_ERR(map);
}
+ err = check_map_prog_compatibility(map, env->prog);
+ if (err) {
+ fdput(f);
+ return err;
+ }
+
/* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map;
insn[1].imm = ((u64) (unsigned long) map) >> 32;
@@ -2577,7 +3031,7 @@ next_insn:
}
/* drop refcnt of maps used by the rejected program */
-static void release_maps(struct verifier_env *env)
+static void release_maps(struct bpf_verifier_env *env)
{
int i;
@@ -2586,7 +3040,7 @@ static void release_maps(struct verifier_env *env)
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
-static void convert_pseudo_ld_imm64(struct verifier_env *env)
+static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -2600,62 +3054,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
-static int convert_ctx_accesses(struct verifier_env *env)
+static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16];
+ const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i;
+ int i, cnt, delta = 0;
+
+ if (ops->gen_prologue) {
+ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+ env->prog);
+ if (cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (cnt) {
+ new_prog = bpf_patch_insn_single(env->prog, 0,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+ }
+ }
- if (!env->prog->aux->ops->convert_ctx_access)
+ if (!ops->convert_ctx_access)
return 0;
- for (i = 0; i < insn_cnt; i++, insn++) {
- u32 insn_delta, cnt;
+ insn = env->prog->insnsi + delta;
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE;
else
continue;
- if (insn->imm != PTR_TO_CTX) {
- /* clear internal mark */
- insn->imm = 0;
+ if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
continue;
- }
- cnt = env->prog->aux->ops->
- convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf, env->prog);
+ cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
- new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
+ new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
+ cnt);
if (!new_prog)
return -ENOMEM;
- insn_delta = cnt - 1;
+ delta += cnt - 1;
/* keep walking new program and skip insns we just inserted */
env->prog = new_prog;
- insn = new_prog->insnsi + i + insn_delta;
-
- insn_cnt += insn_delta;
- i += insn_delta;
+ insn = new_prog->insnsi + i + delta;
}
return 0;
}
-static void free_states(struct verifier_env *env)
+static void free_states(struct bpf_verifier_env *env)
{
- struct verifier_state_list *sl, *sln;
+ struct bpf_verifier_state_list *sl, *sln;
int i;
if (!env->explored_states)
@@ -2678,19 +3144,21 @@ static void free_states(struct verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
- struct verifier_env *env;
+ struct bpf_verifier_env *env;
int ret = -EINVAL;
- if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
- return -E2BIG;
-
- /* 'struct verifier_env' can be global, but since it's not small,
+ /* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ (*prog)->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
env->prog = *prog;
/* grab the mutex to protect few globals used by verifier */
@@ -2709,12 +3177,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* log_* values have to be sane */
if (log_size < 128 || log_size > UINT_MAX >> 8 ||
log_level == 0 || log_ubuf == NULL)
- goto free_env;
+ goto err_unlock;
ret = -ENOMEM;
log_buf = vmalloc(log_size);
if (!log_buf)
- goto free_env;
+ goto err_unlock;
} else {
log_level = 0;
}
@@ -2724,7 +3192,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
goto skip_full_check;
env->explored_states = kcalloc(env->prog->len,
- sizeof(struct verifier_state_list *),
+ sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
@@ -2783,14 +3251,67 @@ skip_full_check:
free_log_buf:
if (log_level)
vfree(log_buf);
-free_env:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
*prog = env->prog;
+err_unlock:
+ mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
kfree(env);
+ return ret;
+}
+
+int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
+ void *priv)
+{
+ struct bpf_verifier_env *env;
+ int ret;
+
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ prog->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
+ env->prog = prog;
+ env->analyzer_ops = ops;
+ env->analyzer_priv = priv;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ log_level = 0;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct bpf_verifier_state_list *),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/capability.c b/kernel/capability.c
index 00411c82dac5..f97fe77ceb88 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -17,7 +17,7 @@
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
@@ -318,6 +318,7 @@ bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
@@ -457,6 +458,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+{
+ return kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
@@ -469,7 +483,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d6b729beba49..53bbca7c4859 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -64,6 +64,9 @@
#include <linux/file.h>
#include <net/sock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ trace_cgroup_destroy_root(root);
+
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
@@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
strcpy(root->release_agent_path, opts.release_agent);
spin_unlock(&release_agent_path_lock);
}
+
+ trace_cgroup_remount(root);
+
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
@@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
+ trace_cgroup_setup_root(root);
+
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
@@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
- int ret;
- ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
- if (ret < 0 || ret >= buflen)
- return NULL;
- return buf;
+ return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}
-char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
- char *ret;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
*
* Return value is the same as kernfs_path().
*/
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroup_root *root;
struct cgroup *cgrp;
int hierarchy_id = 1;
- char *path = NULL;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
if (root) {
cgrp = task_cgroup_from_root(task, root);
- path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- if (strlcpy(buf, "/", buflen) < buflen)
- path = buf;
+ ret = strlcpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- return path;
+ return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
@@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets);
+
+ if (!ret)
+ trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
+
return ret;
}
@@ -3611,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
mutex_unlock(&cgroup_mutex);
@@ -4381,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (task) {
ret = cgroup_migrate(task, false, to->root);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
@@ -5046,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work)
ss->css_released(css);
} else {
/* cgroup release path */
+ trace_cgroup_release(cgrp);
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -5059,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
+
+ cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -5204,6 +5221,11 @@ err_free_css:
return ERR_PTR(err);
}
+/*
+ * The returned cgroup is fully initialized including its control mask, but
+ * it isn't associated with its kernfs_node and doesn't have the control
+ * mask applied.
+ */
static struct cgroup *cgroup_create(struct cgroup *parent)
{
struct cgroup_root *root = parent->root;
@@ -5266,12 +5288,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- cgroup_propagate_control(cgrp);
+ if (parent)
+ cgroup_bpf_inherit(cgrp, parent);
- /* @cgrp doesn't have dir yet so the following will only create csses */
- ret = cgroup_apply_control_enable(cgrp);
- if (ret)
- goto out_destroy;
+ cgroup_propagate_control(cgrp);
return cgrp;
@@ -5280,9 +5300,6 @@ out_cancel_ref:
out_free_cgrp:
kfree(cgrp);
return ERR_PTR(ret);
-out_destroy:
- cgroup_destroy_locked(cgrp);
- return ERR_PTR(ret);
}
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
@@ -5332,6 +5349,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
+ trace_cgroup_mkdir(cgrp);
+
/* let's create and online css's */
kernfs_activate(kn);
@@ -5507,6 +5526,9 @@ static int cgroup_rmdir(struct kernfs_node *kn)
ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ trace_cgroup_rmdir(cgrp);
+
cgroup_kn_unlock(kn);
return ret;
}
@@ -5627,6 +5649,12 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ /*
+ * The latency of the synchronize_sched() is too high for cgroups,
+ * avoid it at the cost of forcing all readers into the slow path.
+ */
+ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -5737,7 +5765,7 @@ core_initcall(cgroup_wq_init);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *path;
+ char *buf;
int retval;
struct cgroup_root *root;
@@ -5780,18 +5808,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
- path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (!path) {
+ if (retval >= PATH_MAX)
retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_unlock;
- }
+
+ seq_puts(m, buf);
} else {
- path = "/";
+ seq_puts(m, "/");
}
- seq_puts(m, path);
-
if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
seq_puts(m, " (deleted)\n");
else
@@ -6056,8 +6084,9 @@ static void cgroup_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *pathbuf = NULL, *agentbuf = NULL;
char *argv[3], *envp[3];
+ int ret;
mutex_lock(&cgroup_mutex);
@@ -6067,13 +6096,13 @@ static void cgroup_release_agent(struct work_struct *work)
goto out;
spin_lock_irq(&css_set_lock);
- path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_irq(&css_set_lock);
- if (!path)
+ if (ret < 0 || ret >= PATH_MAX)
goto out;
argv[0] = agentbuf;
- argv[1] = path;
+ argv[1] = pathbuf;
argv[2] = NULL;
/* minimal command environment */
@@ -6322,6 +6351,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
/* cgroup namespaces */
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
@@ -6343,6 +6382,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
@@ -6354,6 +6394,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
struct css_set *cset;
BUG_ON(!old_ns);
@@ -6367,6 +6408,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
/* It is not safe to take cgroup_mutex here */
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
@@ -6376,10 +6421,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
new_ns = alloc_cgroup_ns();
if (IS_ERR(new_ns)) {
put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
return new_ns;
}
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
return new_ns;
@@ -6430,12 +6477,18 @@ static void cgroupns_put(struct ns_common *ns)
put_cgroup_ns(to_cg_ns(ns));
}
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
+ .owner = cgroupns_owner,
};
static __init int cgroup_namespaces_init(void)
@@ -6444,6 +6497,20 @@ static __init int cgroup_namespaces_init(void)
}
subsys_initcall(cgroup_namespaces_init);
+#ifdef CONFIG_CGROUP_BPF
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/compat.c b/kernel/compat.c
index 333d364be29d..19aec5d98108 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -28,7 +28,7 @@
#include <linux/ptrace.h>
#include <linux/gfp.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
@@ -307,12 +307,17 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
+asmlinkage long sys_ni_posix_timers(void);
+
COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
+ if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
+ return sys_ni_posix_timers();
+
error = do_getitimer(which, &kit);
if (!error && put_compat_itimerval(it, &kit))
error = -EFAULT;
@@ -326,6 +331,9 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
struct itimerval kin, kout;
int error;
+ if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
+ return sys_ni_posix_timers();
+
if (in) {
if (get_compat_itimerval(&kin, in))
return -EFAULT;
diff --git a/kernel/configs.c b/kernel/configs.c
index c18b1f1ae515..2df132b20217 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -28,7 +28,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/**************************************************/
/* the actual current config file */
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 9f748ed7bea8..1a8f34f63601 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y
CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
CONFIG_AUDIT=y
-CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_CPUACCT=y
@@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_SCHED=y
CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
+CONFIG_DEFAULT_SECURITY_SELINUX=y
CONFIG_EMBEDDED=y
CONFIG_FB=y
CONFIG_HIGH_RES_TIMERS=y
@@ -41,7 +38,6 @@ CONFIG_IPV6=y
CONFIG_IPV6_MIP6=y
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_PRIVACY=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
CONFIG_IP_ADVANCED_ROUTER=y
@@ -135,6 +131,7 @@ CONFIG_PREEMPT=y
CONFIG_QUOTA=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECCOMP=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index e3b953e966d2..297756be369c 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,12 +6,16 @@
# CONFIG_PM_WAKELOCKS_GC is not set
# CONFIG_VT is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_COMPACTION=y
CONFIG_DEBUG_RODATA=y
+CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
CONFIG_DRAGONRISE_FF=y
CONFIG_ENABLE_DEFAULT_TRACERS=y
CONFIG_EXT4_FS=y
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644
index 000000000000..8d9643767142
--- /dev/null
+++ b/kernel/configs/kvm_guest.config
@@ -0,0 +1,32 @@
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_BLK_DEV=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_TTY=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_BINFMT_ELF=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_9P_FS=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 341bf80f80bd..0a5f630f5c54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,8 @@
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/smpboot.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -37,8 +39,9 @@
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
- * @cb_stat: The state for a single callback (install/uninstall)
- * @cb: Single callback function (install/uninstall)
+ * @single: Single callback invocation
+ * @bringup: Single callback bringup or teardown selector
+ * @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done: Signal completion to the issuer of the task
*/
@@ -49,8 +52,10 @@ struct cpuhp_cpu_state {
struct task_struct *thread;
bool should_run;
bool rollback;
+ bool single;
+ bool bringup;
+ struct hlist_node *node;
enum cpuhp_state cb_state;
- int (*cb)(unsigned int cpu);
int result;
struct completion done;
#endif
@@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
* @cant_stop: Bringup/teardown can't be stopped at this step
*/
struct cpuhp_step {
- const char *name;
- int (*startup)(unsigned int cpu);
- int (*teardown)(unsigned int cpu);
- bool skip_onerr;
- bool cant_stop;
+ const char *name;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } startup;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } teardown;
+ struct hlist_head list;
+ bool skip_onerr;
+ bool cant_stop;
+ bool multi_instance;
};
static DEFINE_MUTEX(cpuhp_state_mutex);
static struct cpuhp_step cpuhp_bp_states[];
static struct cpuhp_step cpuhp_ap_states[];
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+ /*
+ * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+ * purposes as that state is handled explicitly in cpu_down.
+ */
+ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+ struct cpuhp_step *sp;
+
+ sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
+ return sp + state;
+}
+
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @step: The step in the state machine
- * @cb: The callback function to invoke
+ * @bringup: True if the bringup callback should be invoked
*
- * Called from cpu hotplug and from the state register machinery
+ * Called from cpu hotplug and from the state register machinery.
*/
-static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
- int (*cb)(unsigned int))
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+ bool bringup, struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- int ret = 0;
-
- if (cb) {
- trace_cpuhp_enter(cpu, st->target, step, cb);
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ int (*cbm)(unsigned int cpu, struct hlist_node *node);
+ int (*cb)(unsigned int cpu);
+ int ret, cnt;
+
+ if (!step->multi_instance) {
+ cb = bringup ? step->startup.single : step->teardown.single;
+ if (!cb)
+ return 0;
+ trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
- trace_cpuhp_exit(cpu, st->state, step, ret);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+ cbm = bringup ? step->startup.multi : step->teardown.multi;
+ if (!cbm)
+ return 0;
+
+ /* Single invocation for instance add/remove */
+ if (node) {
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+
+ /* State transition. Invoke on all instances */
+ cnt = 0;
+ hlist_for_each(node, &step->list) {
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ if (ret)
+ goto err;
+ cnt++;
+ }
+ return 0;
+err:
+ /* Rollback the instances if one failed */
+ cbm = !bringup ? step->startup.multi : step->teardown.multi;
+ if (!cbm)
+ return ret;
+
+ hlist_for_each(node, &step->list) {
+ if (!cnt--)
+ break;
+ cbm(cpu, node);
}
return ret;
}
@@ -110,23 +183,16 @@ EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
* attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
- * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
- * hotplug callback (un)registration performed using __register_cpu_notifier()
- * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_done);
-
-static RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
@@ -155,7 +221,7 @@ static struct {
.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
+ .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
#endif
};
@@ -260,81 +326,23 @@ void cpu_hotplug_disable(void)
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
-void cpu_hotplug_enable(void)
+static void __cpu_hotplug_enable(void)
{
- cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
- cpu_maps_update_done();
+ if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+ return;
+ cpu_hotplug_disabled--;
}
-EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
-#endif /* CONFIG_HOTPLUG_CPU */
-/* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+void cpu_hotplug_enable(void)
{
- int ret;
cpu_maps_update_begin();
- ret = raw_notifier_chain_register(&cpu_chain, nb);
+ __cpu_hotplug_enable();
cpu_maps_update_done();
- return ret;
-}
-
-int __register_cpu_notifier(struct notifier_block *nb)
-{
- return raw_notifier_chain_register(&cpu_chain, nb);
-}
-
-static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
- int *nr_calls)
-{
- unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
- void *hcpu = (void *)(long)cpu;
-
- int ret;
-
- ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
- nr_calls);
-
- return notifier_to_errno(ret);
-}
-
-static int cpu_notify(unsigned long val, unsigned int cpu)
-{
- return __cpu_notify(val, cpu, -1, NULL);
-}
-
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
- BUG_ON(cpu_notify(val, cpu));
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
+#endif /* CONFIG_HOTPLUG_CPU */
/* Notifier wrappers for transitioning to state machine */
-static int notify_prepare(unsigned int cpu)
-{
- int nr_calls = 0;
- int ret;
-
- ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
- if (ret) {
- nr_calls--;
- printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
- __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
- }
- return ret;
-}
-
-static int notify_online(unsigned int cpu)
-{
- cpu_notify(CPU_ONLINE, cpu);
- return 0;
-}
-
-static int notify_starting(unsigned int cpu)
-{
- cpu_notify(CPU_STARTING, cpu);
- return 0;
-}
static int bringup_wait_for_ap(unsigned int cpu)
{
@@ -349,12 +357,18 @@ static int bringup_cpu(unsigned int cpu)
struct task_struct *idle = idle_thread_get(cpu);
int ret;
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
- if (ret) {
- cpu_notify(CPU_UP_CANCELED, cpu);
+ irq_unlock_sparse();
+ if (ret)
return ret;
- }
ret = bringup_wait_for_ap(cpu);
BUG_ON(!cpu_online(cpu));
return ret;
@@ -363,62 +377,55 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps)
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
{
for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = steps + st->state;
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, step->startup);
+ cpuhp_invoke_callback(cpu, st->state, true, NULL);
}
}
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps, enum cpuhp_state target)
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
for (; st->state > target; st->state--) {
- struct cpuhp_step *step = steps + st->state;
-
- ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_down(cpu, st, steps);
+ undo_cpu_down(cpu, st);
break;
}
}
return ret;
}
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps)
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
for (st->state--; st->state > st->target; st->state--) {
- struct cpuhp_step *step = steps + st->state;
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL);
}
}
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- struct cpuhp_step *steps, enum cpuhp_state target)
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
while (st->state < target) {
- struct cpuhp_step *step;
-
st->state++;
- step = steps + st->state;
- ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_up(cpu, st, steps);
+ undo_cpu_up(cpu, st);
break;
}
}
@@ -447,13 +454,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
{
enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
- return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
+ return cpuhp_down_callbacks(cpu, st, target);
}
/* Execute the online startup callbacks. Used to be CPU_ONLINE */
static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
{
- return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
+ return cpuhp_up_callbacks(cpu, st, st->target);
}
/*
@@ -476,23 +483,20 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
/* Single callback invocation for [un]install ? */
- if (st->cb) {
+ if (st->single) {
if (st->cb_state < CPUHP_AP_ONLINE) {
local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ ret = cpuhp_invoke_callback(cpu, st->cb_state,
+ st->bringup, st->node);
local_irq_enable();
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ ret = cpuhp_invoke_callback(cpu, st->cb_state,
+ st->bringup, st->node);
}
} else if (st->rollback) {
BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
- undo_cpu_down(cpu, st, cpuhp_ap_states);
- /*
- * This is a momentary workaround to keep the notifier users
- * happy. Will go away once we got rid of the notifiers.
- */
- cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+ undo_cpu_down(cpu, st);
st->rollback = false;
} else {
/* Cannot happen .... */
@@ -509,8 +513,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
}
/* Invoke a single callback on a remote cpu */
-static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
- int (*cb)(unsigned int))
+static int
+cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -522,10 +527,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, cb);
+ return cpuhp_invoke_callback(cpu, state, bringup, node);
st->cb_state = state;
- st->cb = cb;
+ st->single = true;
+ st->bringup = bringup;
+ st->node = node;
+
/*
* Make sure the above stores are visible before should_run becomes
* true. Paired with the mb() above in cpuhp_thread_fun()
@@ -541,7 +549,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
{
st->result = 0;
- st->cb = NULL;
+ st->single = false;
/*
* Make sure the above stores are visible before should_run becomes
* true. Paired with the mb() above in cpuhp_thread_fun()
@@ -579,22 +587,6 @@ void __init cpuhp_threads_init(void)
}
#ifdef CONFIG_HOTPLUG_CPU
-EXPORT_SYMBOL(register_cpu_notifier);
-EXPORT_SYMBOL(__register_cpu_notifier);
-void unregister_cpu_notifier(struct notifier_block *nb)
-{
- cpu_maps_update_begin();
- raw_notifier_chain_unregister(&cpu_chain, nb);
- cpu_maps_update_done();
-}
-EXPORT_SYMBOL(unregister_cpu_notifier);
-
-void __unregister_cpu_notifier(struct notifier_block *nb)
-{
- raw_notifier_chain_unregister(&cpu_chain, nb);
-}
-EXPORT_SYMBOL(__unregister_cpu_notifier);
-
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
@@ -660,26 +652,6 @@ static inline void check_for_tasks(int dead_cpu)
read_unlock(&tasklist_lock);
}
-static int notify_down_prepare(unsigned int cpu)
-{
- int err, nr_calls = 0;
-
- err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
- if (err) {
- nr_calls--;
- __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
- pr_warn("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- }
- return err;
-}
-
-static int notify_dying(unsigned int cpu)
-{
- cpu_notify(CPU_DYING, cpu);
- return 0;
-}
-
/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
@@ -692,12 +664,16 @@ static int take_cpu_down(void *_param)
if (err < 0)
return err;
+ /*
+ * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
+ * do this step again.
+ */
+ WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
+ st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--) {
- struct cpuhp_step *step = cpuhp_ap_states + st->state;
+ for (; st->state > target; st->state--)
+ cpuhp_invoke_callback(cpu, st->state, false, NULL);
- cpuhp_invoke_callback(cpu, st->state, step->teardown);
- }
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
@@ -734,7 +710,7 @@ static int takedown_cpu(unsigned int cpu)
BUG_ON(cpu_online(cpu));
/*
- * The migration_call() CPU_DYING callback will have removed all
+ * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
* runnable tasks from the cpu, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
@@ -754,13 +730,6 @@ static int takedown_cpu(unsigned int cpu)
return 0;
}
-static int notify_dead(unsigned int cpu)
-{
- cpu_notify_nofail(CPU_DEAD, cpu);
- check_for_tasks(cpu);
- return 0;
-}
-
static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
@@ -784,10 +753,7 @@ void cpuhp_report_idle_dead(void)
}
#else
-#define notify_down_prepare NULL
#define takedown_cpu NULL
-#define notify_dead NULL
-#define notify_dying NULL
#endif
#ifdef CONFIG_HOTPLUG_CPU
@@ -798,7 +764,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int prev_state, ret = 0;
- bool hasdied = false;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -836,19 +801,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
* to do the further cleanups.
*/
- ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+ ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
st->target = prev_state;
st->rollback = true;
cpuhp_kick_ap_work(cpu);
}
- hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
cpu_hotplug_done();
- /* This post dead nonsense must die */
- if (!ret && hasdied)
- cpu_notify_nofail(CPU_POST_DEAD, cpu);
return ret;
}
@@ -877,10 +838,9 @@ EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
/**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
* @cpu: cpu that just started
*
- * This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
@@ -889,12 +849,10 @@ void notify_cpu_starting(unsigned int cpu)
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
- struct cpuhp_step *step;
-
st->state++;
- step = cpuhp_ap_states + st->state;
- cpuhp_invoke_callback(cpu, st->state, step->startup);
+ cpuhp_invoke_callback(cpu, st->state, true, NULL);
}
}
@@ -979,7 +937,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
* responsible for bringing it up to the target state.
*/
target = min((int)target, CPUHP_BRINGUP_CPU);
- ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
+ ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpu_hotplug_done();
return ret;
@@ -1024,12 +982,13 @@ EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-int disable_nonboot_cpus(void)
+int freeze_secondary_cpus(int primary)
{
- int cpu, first_cpu, error = 0;
+ int cpu, error = 0;
cpu_maps_update_begin();
- first_cpu = cpumask_first(cpu_online_mask);
+ if (!cpu_online(primary))
+ primary = cpumask_first(cpu_online_mask);
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -1038,7 +997,7 @@ int disable_nonboot_cpus(void)
pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
- if (cpu == first_cpu)
+ if (cpu == primary)
continue;
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
@@ -1081,7 +1040,7 @@ void enable_nonboot_cpus(void)
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
@@ -1170,51 +1129,50 @@ core_initcall(cpu_hotplug_pm_sync_init);
static struct cpuhp_step cpuhp_bp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
- .startup = NULL,
- .teardown = NULL,
+ .startup.single = NULL,
+ .teardown.single = NULL,
},
#ifdef CONFIG_SMP
[CPUHP_CREATE_THREADS]= {
- .name = "threads:create",
- .startup = smpboot_create_threads,
- .teardown = NULL,
+ .name = "threads:prepare",
+ .startup.single = smpboot_create_threads,
+ .teardown.single = NULL,
.cant_stop = true,
},
[CPUHP_PERF_PREPARE] = {
- .name = "perf prepare",
- .startup = perf_event_init_cpu,
- .teardown = perf_event_exit_cpu,
+ .name = "perf:prepare",
+ .startup.single = perf_event_init_cpu,
+ .teardown.single = perf_event_exit_cpu,
},
[CPUHP_WORKQUEUE_PREP] = {
- .name = "workqueue prepare",
- .startup = workqueue_prepare_cpu,
- .teardown = NULL,
+ .name = "workqueue:prepare",
+ .startup.single = workqueue_prepare_cpu,
+ .teardown.single = NULL,
},
[CPUHP_HRTIMERS_PREPARE] = {
- .name = "hrtimers prepare",
- .startup = hrtimers_prepare_cpu,
- .teardown = hrtimers_dead_cpu,
+ .name = "hrtimers:prepare",
+ .startup.single = hrtimers_prepare_cpu,
+ .teardown.single = hrtimers_dead_cpu,
},
[CPUHP_SMPCFD_PREPARE] = {
- .name = "SMPCFD prepare",
- .startup = smpcfd_prepare_cpu,
- .teardown = smpcfd_dead_cpu,
+ .name = "smpcfd:prepare",
+ .startup.single = smpcfd_prepare_cpu,
+ .teardown.single = smpcfd_dead_cpu,
},
- [CPUHP_RCUTREE_PREP] = {
- .name = "RCU-tree prepare",
- .startup = rcutree_prepare_cpu,
- .teardown = rcutree_dead_cpu,
+ [CPUHP_RELAY_PREPARE] = {
+ .name = "relay:prepare",
+ .startup.single = relay_prepare_cpu,
+ .teardown.single = NULL,
},
- /*
- * Preparatory and dead notifiers. Will be replaced once the notifiers
- * are converted to states.
- */
- [CPUHP_NOTIFY_PREPARE] = {
- .name = "notify:prepare",
- .startup = notify_prepare,
- .teardown = notify_dead,
- .skip_onerr = true,
- .cant_stop = true,
+ [CPUHP_SLAB_PREPARE] = {
+ .name = "slab:prepare",
+ .startup.single = slab_prepare_cpu,
+ .teardown.single = slab_dead_cpu,
+ },
+ [CPUHP_RCUTREE_PREP] = {
+ .name = "RCU/tree:prepare",
+ .startup.single = rcutree_prepare_cpu,
+ .teardown.single = rcutree_dead_cpu,
},
/*
* On the tear-down path, timers_dead_cpu() must be invoked
@@ -1222,20 +1180,21 @@ static struct cpuhp_step cpuhp_bp_states[] = {
* otherwise a RCU stall occurs.
*/
[CPUHP_TIMERS_DEAD] = {
- .name = "timers dead",
- .startup = NULL,
- .teardown = timers_dead_cpu,
+ .name = "timers:dead",
+ .startup.single = NULL,
+ .teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
- .startup = bringup_cpu,
- .teardown = NULL,
+ .startup.single = bringup_cpu,
+ .teardown.single = NULL,
.cant_stop = true,
},
[CPUHP_AP_SMPCFD_DYING] = {
- .startup = NULL,
- .teardown = smpcfd_dying_cpu,
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
},
/*
* Handled on control processor until the plugged processor manages
@@ -1243,8 +1202,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
*/
[CPUHP_TEARDOWN_CPU] = {
.name = "cpu:teardown",
- .startup = NULL,
- .teardown = takedown_cpu,
+ .startup.single = NULL,
+ .teardown.single = takedown_cpu,
.cant_stop = true,
},
#else
@@ -1270,24 +1229,13 @@ static struct cpuhp_step cpuhp_ap_states[] = {
/* First state is scheduler control. Interrupts are disabled */
[CPUHP_AP_SCHED_STARTING] = {
.name = "sched:starting",
- .startup = sched_cpu_starting,
- .teardown = sched_cpu_dying,
+ .startup.single = sched_cpu_starting,
+ .teardown.single = sched_cpu_dying,
},
[CPUHP_AP_RCUTREE_DYING] = {
- .startup = NULL,
- .teardown = rcutree_dying_cpu,
- },
- /*
- * Low level startup/teardown notifiers. Run with interrupts
- * disabled. Will be removed once the notifiers are converted to
- * states.
- */
- [CPUHP_AP_NOTIFY_STARTING] = {
- .name = "notify:starting",
- .startup = notify_starting,
- .teardown = notify_dying,
- .skip_onerr = true,
- .cant_stop = true,
+ .name = "RCU/tree:dying",
+ .startup.single = NULL,
+ .teardown.single = rcutree_dying_cpu,
},
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronization */
@@ -1296,35 +1244,24 @@ static struct cpuhp_step cpuhp_ap_states[] = {
},
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
- .name = "smpboot:threads",
- .startup = smpboot_unpark_threads,
- .teardown = NULL,
+ .name = "smpboot/threads:online",
+ .startup.single = smpboot_unpark_threads,
+ .teardown.single = NULL,
},
[CPUHP_AP_PERF_ONLINE] = {
- .name = "perf online",
- .startup = perf_event_init_cpu,
- .teardown = perf_event_exit_cpu,
+ .name = "perf:online",
+ .startup.single = perf_event_init_cpu,
+ .teardown.single = perf_event_exit_cpu,
},
[CPUHP_AP_WORKQUEUE_ONLINE] = {
- .name = "workqueue online",
- .startup = workqueue_online_cpu,
- .teardown = workqueue_offline_cpu,
+ .name = "workqueue:online",
+ .startup.single = workqueue_online_cpu,
+ .teardown.single = workqueue_offline_cpu,
},
[CPUHP_AP_RCUTREE_ONLINE] = {
- .name = "RCU-tree online",
- .startup = rcutree_online_cpu,
- .teardown = rcutree_offline_cpu,
- },
-
- /*
- * Online/down_prepare notifiers. Will be removed once the notifiers
- * are converted to states.
- */
- [CPUHP_AP_NOTIFY_ONLINE] = {
- .name = "notify:online",
- .startup = notify_online,
- .teardown = notify_down_prepare,
- .skip_onerr = true,
+ .name = "RCU/tree:online",
+ .startup.single = rcutree_online_cpu,
+ .teardown.single = rcutree_offline_cpu,
},
#endif
/*
@@ -1335,16 +1272,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
/* Last state is scheduler control setting the cpu active */
[CPUHP_AP_ACTIVE] = {
.name = "sched:active",
- .startup = sched_cpu_activate,
- .teardown = sched_cpu_deactivate,
+ .startup.single = sched_cpu_activate,
+ .teardown.single = sched_cpu_deactivate,
},
#endif
/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
- .startup = NULL,
- .teardown = NULL,
+ .startup.single = NULL,
+ .teardown.single = NULL,
},
};
@@ -1356,54 +1293,86 @@ static int cpuhp_cb_check(enum cpuhp_state state)
return 0;
}
-static bool cpuhp_is_ap_state(enum cpuhp_state state)
+/*
+ * Returns a free slot for dynamic assignment of the Online state. The states
+ * are protected by cpuhp_state_mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
{
- /*
- * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
- * purposes as that state is handled explicitely in cpu_down.
- */
- return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
-}
+ enum cpuhp_state i, end;
+ struct cpuhp_step *step;
-static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
-{
- struct cpuhp_step *sp;
+ switch (state) {
+ case CPUHP_AP_ONLINE_DYN:
+ step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+ end = CPUHP_AP_ONLINE_DYN_END;
+ break;
+ case CPUHP_BP_PREPARE_DYN:
+ step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+ end = CPUHP_BP_PREPARE_DYN_END;
+ break;
+ default:
+ return -EINVAL;
+ }
- sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
- return sp + state;
+ for (i = state; i <= end; i++, step++) {
+ if (!step->name)
+ return i;
+ }
+ WARN(1, "No more dynamic states available for CPU hotplug\n");
+ return -ENOSPC;
}
-static void cpuhp_store_callbacks(enum cpuhp_state state,
- const char *name,
- int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu))
+static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
/* (Un)Install the callbacks for further cpu hotplug operations */
struct cpuhp_step *sp;
+ int ret = 0;
mutex_lock(&cpuhp_state_mutex);
+
+ if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
+ ret = cpuhp_reserve_state(state);
+ if (ret < 0)
+ goto out;
+ state = ret;
+ }
sp = cpuhp_get_step(state);
- sp->startup = startup;
- sp->teardown = teardown;
+ if (name && sp->name) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sp->startup.single = startup;
+ sp->teardown.single = teardown;
sp->name = name;
+ sp->multi_instance = multi_instance;
+ INIT_HLIST_HEAD(&sp->list);
+out:
mutex_unlock(&cpuhp_state_mutex);
+ return ret;
}
static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
- return cpuhp_get_step(state)->teardown;
+ return cpuhp_get_step(state)->teardown.single;
}
/*
* Call the startup/teardown function for a step either on the AP or
* on the current CPU.
*/
-static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
- int (*cb)(unsigned int), bool bringup)
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
- if (!cb)
+ if ((bringup && !sp->startup.single) ||
+ (!bringup && !sp->teardown.single))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
@@ -1411,11 +1380,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
*/
#ifdef CONFIG_SMP
if (cpuhp_is_ap_state(state))
- ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+ ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, cb);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node);
#else
- ret = cpuhp_invoke_callback(cpu, state, cb);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1427,13 +1396,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
* Note: The teardown callbacks for rollback are not allowed to fail!
*/
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
- int (*teardown)(unsigned int cpu))
+ struct hlist_node *node)
{
int cpu;
- if (!teardown)
- return;
-
/* Roll back the already executed steps on the other cpus */
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -1444,68 +1410,96 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
/* Did we invoke the startup call on that cpu ? */
if (cpustate >= state)
- cpuhp_issue_call(cpu, state, teardown, false);
+ cpuhp_issue_call(cpu, state, false, node);
}
}
-/*
- * Returns a free for dynamic slot assignment of the Online state. The states
- * are protected by the cpuhp_slot_states mutex and an empty slot is identified
- * by having no name assigned.
- */
-static int cpuhp_reserve_state(enum cpuhp_state state)
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+ bool invoke)
{
- enum cpuhp_state i;
+ struct cpuhp_step *sp;
+ int cpu;
+ int ret;
- mutex_lock(&cpuhp_state_mutex);
- for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
- if (cpuhp_ap_states[i].name)
+ sp = cpuhp_get_step(state);
+ if (sp->multi_instance == false)
+ return -EINVAL;
+
+ get_online_cpus();
+
+ if (!invoke || !sp->startup.multi)
+ goto add_node;
+
+ /*
+ * Try to call the startup callback for each present cpu
+ * depending on the hotplug state of the cpu.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate < state)
continue;
- cpuhp_ap_states[i].name = "Reserved";
- mutex_unlock(&cpuhp_state_mutex);
- return i;
+ ret = cpuhp_issue_call(cpu, state, true, node);
+ if (ret) {
+ if (sp->teardown.multi)
+ cpuhp_rollback_install(cpu, state, node);
+ goto err;
+ }
}
+add_node:
+ ret = 0;
+ mutex_lock(&cpuhp_state_mutex);
+ hlist_add_head(node, &sp->list);
mutex_unlock(&cpuhp_state_mutex);
- WARN(1, "No more dynamic states available for CPU hotplug\n");
- return -ENOSPC;
+
+err:
+ put_online_cpus();
+ return ret;
}
+EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state - Set up the callbacks for a hotplug machine state
- * @state: The state to setup
- * @invoke: If true, the startup function is invoked for cpus where
- * cpu state >= @state
- * @startup: startup callback function
- * @teardown: teardown callback function
+ * @state: The state to setup
+ * @invoke: If true, the startup function is invoked for cpus where
+ * cpu state >= @state
+ * @startup: startup callback function
+ * @teardown: teardown callback function
+ * @multi_instance: State is set up for multiple instances which get
+ * added afterwards.
*
- * Returns 0 if successful, otherwise a proper error code
+ * Returns:
+ * On success:
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * 0 for all other states
+ * On failure: proper (negative) error code
*/
int __cpuhp_setup_state(enum cpuhp_state state,
const char *name, bool invoke,
int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu))
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
int cpu, ret = 0;
- int dyn_state = 0;
+ bool dynstate;
if (cpuhp_cb_check(state) || !name)
return -EINVAL;
get_online_cpus();
- /* currently assignments for the ONLINE state are possible */
- if (state == CPUHP_AP_ONLINE_DYN) {
- dyn_state = 1;
- ret = cpuhp_reserve_state(state);
- if (ret < 0)
- goto out;
+ ret = cpuhp_store_callbacks(state, name, startup, teardown,
+ multi_instance);
+
+ dynstate = state == CPUHP_AP_ONLINE_DYN;
+ if (ret > 0 && dynstate) {
state = ret;
+ ret = 0;
}
- cpuhp_store_callbacks(state, name, startup, teardown);
-
- if (!invoke || !startup)
+ if (ret || !invoke || !startup)
goto out;
/*
@@ -1519,21 +1513,62 @@ int __cpuhp_setup_state(enum cpuhp_state state,
if (cpustate < state)
continue;
- ret = cpuhp_issue_call(cpu, state, startup, true);
+ ret = cpuhp_issue_call(cpu, state, true, NULL);
if (ret) {
- cpuhp_rollback_install(cpu, state, teardown);
- cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ if (teardown)
+ cpuhp_rollback_install(cpu, state, NULL);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
goto out;
}
}
out:
put_online_cpus();
- if (!ret && dyn_state)
+ /*
+ * If the requested state is CPUHP_AP_ONLINE_DYN, return the
+ * dynamically allocated state in case of success.
+ */
+ if (!ret && dynstate)
return state;
return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state);
+int __cpuhp_state_remove_instance(enum cpuhp_state state,
+ struct hlist_node *node, bool invoke)
+{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
+ int cpu;
+
+ BUG_ON(cpuhp_cb_check(state));
+
+ if (!sp->multi_instance)
+ return -EINVAL;
+
+ get_online_cpus();
+ if (!invoke || !cpuhp_get_teardown_cb(state))
+ goto remove;
+ /*
+ * Call the teardown callback for each present cpu depending
+ * on the hotplug state of the cpu. This function is not
+ * allowed to fail currently!
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, false, node);
+ }
+
+remove:
+ mutex_lock(&cpuhp_state_mutex);
+ hlist_del(node);
+ mutex_unlock(&cpuhp_state_mutex);
+ put_online_cpus();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
/**
* __cpuhp_remove_state - Remove the callbacks for a hotplug machine state
* @state: The state to remove
@@ -1545,14 +1580,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state);
*/
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
- int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+ struct cpuhp_step *sp = cpuhp_get_step(state);
int cpu;
BUG_ON(cpuhp_cb_check(state));
get_online_cpus();
- if (!invoke || !teardown)
+ if (sp->multi_instance) {
+ WARN(!hlist_empty(&sp->list),
+ "Error: Removing state %d which has instances left.\n",
+ state);
+ goto remove;
+ }
+
+ if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
@@ -1565,10 +1607,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
int cpustate = st->state;
if (cpustate >= state)
- cpuhp_issue_call(cpu, state, teardown, false);
+ cpuhp_issue_call(cpu, state, false, NULL);
}
remove:
- cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
put_online_cpus();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2b4c20ab5bbe..b3088886cd37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
@@ -2715,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void)
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *p;
+ char *buf;
struct cgroup_subsys_state *css;
int retval;
@@ -2724,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- retval = -ENAMETOOLONG;
css = task_get_css(tsk, cpuset_cgrp_id);
- p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
+ retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
css_put(css);
- if (!p)
+ if (retval >= PATH_MAX)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_free;
- seq_puts(m, p);
+ seq_puts(m, buf);
seq_putc(m, '\n');
retval = 0;
out_free:
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- time_left = loops_per_jiffy * HZ;
+ time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus)
- cpu_relax();
+ udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index fc1ef736253c..e74be38245ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -30,6 +30,7 @@
char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk;
+int kdb_printf_cpu = -1;
static int kgdb_transition_check(char *buffer)
{
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int linecount;
int colcount;
int logging, saved_loglevel = 0;
- int saved_trap_printk;
- int got_printf_lock = 0;
int retlen = 0;
int fnd, len;
+ int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
struct console *c = console_drivers;
- static DEFINE_SPINLOCK(kdb_printf_lock);
unsigned long uninitialized_var(flags);
- preempt_disable();
- saved_trap_printk = kdb_trap_printk;
- kdb_trap_printk = 0;
-
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
* even if it is interleaved with any other text.
*/
- if (!KDB_STATE(PRINTF_LOCK)) {
- KDB_STATE_SET(PRINTF_LOCK);
- spin_lock_irqsave(&kdb_printf_lock, flags);
- got_printf_lock = 1;
- atomic_inc(&kdb_event);
- } else {
- __acquire(kdb_printf_lock);
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ for (;;) {
+ old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
+ if (old_cpu == -1 || old_cpu == this_cpu)
+ break;
+
+ cpu_relax();
}
diag = kdbgetintenv("LINES", &linecount);
@@ -697,7 +693,7 @@ kdb_printit:
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
- cp = (char *) printk_skip_level(kdb_buffer);
+ cp = (char *) printk_skip_headers(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
} else {
@@ -847,16 +843,9 @@ kdb_print_out:
suspend_grep = 0; /* end of what may have been a recursive call */
if (logging)
console_loglevel = saved_loglevel;
- if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
- got_printf_lock = 0;
- spin_unlock_irqrestore(&kdb_printf_lock, flags);
- KDB_STATE_CLEAR(PRINTF_LOCK);
- atomic_dec(&kdb_event);
- } else {
- __release(kdb_printf_lock);
- }
- kdb_trap_printk = saved_trap_printk;
- preempt_enable();
+ /* kdb_printf_cpu locked the code above. */
+ smp_store_release(&kdb_printf_cpu, old_cpu);
+ local_irq_restore(flags);
return retlen;
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2a20c0dfdafc..ca183919d302 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -60,7 +60,6 @@ int kdb_grep_trailing;
* Kernel debugger state flags
*/
int kdb_flags;
-atomic_t kdb_event;
/*
* kdb_lock protects updates to kdb_initial_cpu. Used to
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..fc224fbcf954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -132,7 +132,6 @@ extern int kdb_state;
#define KDB_STATE_PAGER 0x00000400 /* pager is available */
#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
* back to initial cpu */
-#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a45118..660549656991 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -82,19 +82,19 @@ void __delayacct_blkio_end(void)
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- cputime_t utime, stime, stimescaled, utimescaled;
+ u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
unsigned long flags, t1;
s64 tmp;
task_cputime(tsk, &utime, &stime);
tmp = (s64)d->cpu_run_real_total;
- tmp += cputime_to_nsecs(utime + stime);
+ tmp += utime + stime;
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
tmp = (s64)d->cpu_scaled_run_real_total;
- tmp += cputime_to_nsecs(utimescaled + stimescaled);
+ tmp += utimescaled + stimescaled;
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc9bb2225291..77a932b54a64 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -355,6 +355,8 @@ enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_TIME = 0x4,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x8,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
info->timestamp = ctx->timestamp;
}
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ struct list_head *list;
unsigned long flags;
/*
- * disable interrupts to avoid geting nr_cgroup
- * changes via __perf_event_disable(). Also
- * avoids preemption.
+ * Disable interrupts and preemption to prevent this CPU's
+ * cgrp_cpuctx_entry from changing under us.
*/
local_irq_save(flags);
- /*
- * we reschedule only in the presence of cgroup
- * constrained events.
- */
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- continue; /* ensure we process each cpuctx once */
+ list = this_cpu_ptr(&cgrp_cpuctx_list);
+ list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- /*
- * perf_cgroup_events says at least one
- * context on this CPU has cgroup events.
- *
- * ctx->nr_cgroups reports the number of cgroup
- * events for a context.
- */
- if (cpuctx->ctx.nr_cgroups > 0) {
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
+ * we pass the cpuctx->ctx to perf_cgroup_from_task()
+ * because cgroup events are only per-cpu
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task,
+ &cpuctx->ctx);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
local_irq_restore(flags);
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
struct perf_cpu_context *cpuctx;
+ struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -902,7 +892,16 @@ list_update_cgroup_event(struct perf_event *event,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
- cpuctx->cgrp = add ? event->cgrp : NULL;
+ cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+ /* cpuctx->cgrp is NULL unless a cgroup event is active on this CPU. */
+ if (add) {
+ list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+ cpuctx->cgrp = event->cgrp;
+ } else {
+ list_del(cpuctx_entry);
+ cpuctx->cgrp = NULL;
+ }
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1445,6 +1444,20 @@ static void update_group_times(struct perf_event *leader)
update_event_times(event);
}
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ enum event_type_t event_type;
+
+ lockdep_assert_held(&ctx->lock);
+
+ event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+ if (!ctx->task)
+ event_type |= EVENT_CPU;
+
+ return event_type;
+}
+
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -1461,7 +1474,6 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
-
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1475,8 +1487,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event) {
struct list_head *list;
- if (is_software_event(event))
- event->group_flags |= PERF_GROUP_SOFTWARE;
+ event->group_caps = event->event_caps;
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
@@ -1617,6 +1628,8 @@ static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double attach due to group movement in perf_event_open.
*/
@@ -1630,9 +1643,7 @@ static void perf_group_attach(struct perf_event *event)
WARN_ON_ONCE(group_leader->ctx != event->ctx);
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+ group_leader->group_caps &= event->event_caps;
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
@@ -1692,6 +1703,8 @@ static void perf_group_detach(struct perf_event *event)
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1723,7 +1736,7 @@ static void perf_group_detach(struct perf_event *event)
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
- sibling->group_flags = event->group_flags;
+ sibling->group_caps = event->group_caps;
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
@@ -1832,6 +1845,8 @@ group_sched_out(struct perf_event *group_event,
struct perf_event *event;
int state = group_event->state;
+ perf_pmu_disable(ctx->pmu);
+
event_sched_out(group_event, cpuctx, ctx);
/*
@@ -1840,6 +1855,8 @@ group_sched_out(struct perf_event *group_event,
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
+ perf_pmu_enable(ctx->pmu);
+
if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -1886,9 +1903,29 @@ __perf_remove_from_context(struct perf_event *event,
*/
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- lockdep_assert_held(&event->ctx->mutex);
+ struct perf_event_context *ctx = event->ctx;
+
+ lockdep_assert_held(&ctx->mutex);
event_function_call(event, __perf_remove_from_context, (void *)flags);
+
+ /*
+ * The above event_function_call() can NO-OP when it hits
+ * TASK_TOMBSTONE. In that case we must already have been detached
+ * from the context (by perf_event_exit_event()) but the grouping
+ * might still be intact.
+ */
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ if ((flags & DETACH_GROUP) &&
+ (event->attach_state & PERF_ATTACH_GROUP)) {
+ /*
+ * Since in that case we cannot possibly be scheduled, simply
+ * detach now.
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ raw_spin_unlock_irq(&ctx->lock);
+ }
}
/*
@@ -1959,6 +1996,12 @@ void perf_event_disable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_disable);
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
+}
+
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx,
u64 tstamp)
@@ -2145,7 +2188,7 @@ static int group_can_go_on(struct perf_event *event,
/*
* Groups consisting entirely of software events can always go on.
*/
- if (event->group_flags & PERF_GROUP_SOFTWARE)
+ if (event->group_caps & PERF_EV_CAP_SOFTWARE)
return 1;
/*
* If an exclusive group is already on, no other hardware
@@ -2188,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task);
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
if (!cpuctx->task_ctx)
return;
@@ -2196,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ ctx_sched_out(ctx, cpuctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2211,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
+/*
+ * We want to maintain the following priority of scheduling:
+ * - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ * - task pinned (EVENT_PINNED)
+ * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ * - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
- struct perf_event_context *task_ctx)
+ struct perf_event_context *task_ctx,
+ enum event_type_t event_type)
{
+ enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+ bool cpu_event = !!(event_type & EVENT_CPU);
+
+ /*
+ * If pinned groups are involved, flexible groups also need to be
+ * scheduled out.
+ */
+ if (event_type & EVENT_PINNED)
+ event_type |= EVENT_FLEXIBLE;
+
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx);
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+ /*
+ * Decide which cpu ctx groups to schedule out based on the types
+ * of events that caused rescheduling:
+ * - EVENT_CPU: schedule out corresponding groups;
+ * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+ * - otherwise, do nothing more.
+ */
+ if (cpu_event)
+ cpu_ctx_sched_out(cpuctx, ctx_event_type);
+ else if (ctx_event_type & EVENT_PINNED)
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
}
@@ -2234,7 +2316,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2242,30 +2324,29 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If it's not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2313,13 +2394,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(); if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctxp[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2333,12 +2437,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
@@ -2405,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, get_event_type(event));
}
/*
@@ -2491,7 +2599,7 @@ static int __perf_event_stop(void *info)
* while restarting.
*/
if (sd->restart)
- event->pmu->start(event, PERF_EF_START);
+ event->pmu->start(event, 0);
return 0;
}
@@ -2832,24 +2940,41 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(cpuctx, ctx);
+ task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
raw_spin_unlock(&ctx->lock);
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
+
void perf_sched_cb_inc(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
this_cpu_inc(perf_sched_cb_usages);
}
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
*/
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
@@ -2857,34 +2982,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
- unsigned long flags;
if (prev == next)
return;
- local_irq_save(flags);
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (pmu->sched_task) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
- perf_pmu_disable(pmu);
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ continue;
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- perf_pmu_enable(pmu);
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
}
static void perf_event_switch(struct task_struct *task,
@@ -3062,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
* cpu flexible, task flexible.
+ *
+ * However, if task's ctx is not carrying any pinned
+ * events, no need to flip the cpuctx's events around.
*/
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!list_empty(&ctx->pinned_groups))
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
@@ -3378,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
@@ -3391,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- list_for_each_entry(event, &ctx->event_list, event_entry)
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
+ event_type |= get_event_type(event);
+ }
/*
* Unclone and reschedule this context if we enabled any event.
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx);
+ ctx_resched(cpuctx, ctx, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -3416,6 +3538,23 @@ struct perf_read_data {
int ret;
};
+static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
+{
+ u16 local_pkg, event_pkg;
+
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+ int local_cpu = smp_processor_id();
+
+ event_pkg = topology_physical_package_id(event_cpu);
+ local_pkg = topology_physical_package_id(local_cpu);
+
+ if (event_pkg == local_pkg)
+ return local_cpu;
+ }
+
+ return event_cpu;
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -3537,7 +3676,7 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
@@ -3549,17 +3688,26 @@ static int perf_event_read(struct perf_event *event, bool group)
.group = group,
.ret = 0,
};
+
+ event_cpu = READ_ONCE(event->oncpu);
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return 0;
+
+ preempt_disable();
+ event_cpu = __perf_event_read_cpu(event, event_cpu);
+
/*
* Purposely ignore the smp_call_function_single() return
* value.
*
- * If event->oncpu isn't a valid CPU it means the event got
+ * If event_cpu isn't a valid CPU it means the event got
* scheduled out and that will have updated the event count.
*
* Therefore, either way, we'll have an up-to-date event count
* after this.
*/
- (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
+ preempt_enable();
ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
@@ -5350,9 +5498,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
{
int bit;
+ DECLARE_BITMAP(_mask, 64);
- for_each_set_bit(bit, (const unsigned long *) &mask,
- sizeof(mask) * BITS_PER_BYTE) {
+ bitmap_from_u64(_mask, mask);
+ for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
u64 val;
val = perf_reg_value(regs, bit);
@@ -6539,6 +6688,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -6565,27 +6735,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
@@ -6654,7 +6803,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file->f_inode)
+ if (filter->inode != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -6990,25 +7139,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7036,6 +7172,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -7045,11 +7209,11 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
+
+ perf_event_disable_inatomic(event);
}
- event->overflow_handler(event, data, regs);
+ READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -7664,11 +7828,83 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
+#ifdef CONFIG_BPF_SYSCALL
+static void bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .regs = regs,
+ };
+ int ret = 0;
+
+ preempt_disable();
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(event->prog, &ctx);
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+ if (!ret)
+ return;
+
+ event->orig_overflow_handler(event, data, regs);
+}
+
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ event->prog = prog;
+ event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
+ WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
+ return 0;
+}
+
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+ return -EOPNOTSUPP;
+}
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
+ if (event->attr.type == PERF_TYPE_HARDWARE ||
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ return perf_event_set_bpf_handler(event, prog_fd);
+
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
@@ -7709,6 +7945,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
{
struct bpf_prog *prog;
+ perf_event_free_bpf_handler(event);
+
if (!event->tp_event)
return;
@@ -7857,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
+ if (!ifh->nr_file_filters)
+ return;
+
mm = get_task_mm(event->ctx->task);
if (!mm)
goto restart;
@@ -7908,6 +8149,7 @@ restart:
* if <size> is not specified, the range is treated as a single address.
*/
enum {
+ IF_ACT_NONE = -1,
IF_ACT_FILTER,
IF_ACT_START,
IF_ACT_STOP,
@@ -7931,6 +8173,7 @@ static const match_table_t if_tokens = {
{ IF_SRC_KERNEL, "%u/%u" },
{ IF_SRC_FILEADDR, "%u@%s" },
{ IF_SRC_KERNELADDR, "%u" },
+ { IF_ACT_NONE, NULL },
};
/*
@@ -8025,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
* attribute.
*/
if (state == IF_STATE_END) {
+ ret = -EINVAL;
if (kernel && event->attr.exclude_kernel)
goto fail;
@@ -8032,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (!filename)
goto fail;
+ /*
+ * For now, we only support file-based filters
+ * in per-task events; doing so for CPU-wide
+ * events requires additional context switching
+ * trickery, since same object code will be
+ * mapped at different virtual addresses in
+ * different processes.
+ */
+ ret = -EOPNOTSUPP;
+ if (!event->ctx->task)
+ goto fail_free_name;
+
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
@@ -8047,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
!S_ISREG(filter->inode->i_mode))
/* free_filters_list() will iput() */
goto fail;
+
+ event->addr_filters.nr_file_filters++;
}
/* ready to consume more filters */
@@ -8086,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
if (WARN_ON_ONCE(event->parent))
return -EINVAL;
- /*
- * For now, we only support filtering in per-task events; doing so
- * for CPU-wide events requires additional context switching trickery,
- * since same object code will be mapped at different virtual
- * addresses in different processes.
- */
- if (!event->ctx->task)
- return -EOPNOTSUPP;
-
ret = perf_event_parse_addr_filter(event, filter_str, &filters);
if (ret)
- return ret;
+ goto fail_clear_files;
ret = event->pmu->addr_filters_validate(&filters);
- if (ret) {
- free_filters_list(&filters);
- return ret;
- }
+ if (ret)
+ goto fail_free_filters;
/* remove existing filters, if any */
perf_addr_filters_splice(event, &filters);
@@ -8112,6 +8359,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
perf_event_for_each_child(event, perf_event_addr_filters_apply);
return ret;
+
+fail_free_filters:
+ free_filters_list(&filters);
+
+fail_clear_files:
+ event->addr_filters.nr_file_filters = 0;
+
+ return ret;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -8463,37 +8718,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
return NULL;
}
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- struct pmu *i;
-
mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
free_percpu(pmu->pmu_cpu_context);
-out:
mutex_unlock(&pmus_lock);
}
@@ -8697,8 +8925,6 @@ skip_type:
cpuctx->ctx.pmu = pmu;
__perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -8751,7 +8977,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
+ int remove_device;
+
mutex_lock(&pmus_lock);
+ remove_device = pmu_bus_running;
list_del_rcu(&pmu->entry);
mutex_unlock(&pmus_lock);
@@ -8765,10 +8994,12 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (remove_device) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
free_pmu_context(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -8811,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
idx = srcu_read_lock(&pmus_srcu);
+ /* Try parent's PMU first: */
+ if (event->parent && event->parent->pmu) {
+ pmu = event->parent->pmu;
+ ret = perf_try_init_event(pmu, event);
+ if (!ret)
+ goto unlock;
+ }
+
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
@@ -9025,6 +9264,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
+ if (overflow_handler == bpf_overflow_handler) {
+ struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto err_ns;
+ }
+ event->prog = prog;
+ event->orig_overflow_handler =
+ parent_event->orig_overflow_handler;
+ }
+#endif
}
if (overflow_handler) {
@@ -9365,6 +9617,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+ struct perf_event_context *ctx)
+{
+ struct perf_event_context *gctx;
+
+again:
+ rcu_read_lock();
+ gctx = READ_ONCE(group_leader->ctx);
+ if (!atomic_inc_not_zero(&gctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ if (group_leader->ctx != gctx) {
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ goto again;
+ }
+
+ return gctx;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9505,6 +9788,9 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@@ -9518,7 +9804,7 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = group_leader->pmu;
} else if (is_software_event(group_leader) &&
- (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
* try to add a hardware event, move the whole group to
@@ -9605,12 +9891,31 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- gctx = group_leader->ctx;
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}
+
+ /*
+ * Check if we raced against another sys_perf_event_open() call
+ * moving the software group underneath us.
+ */
+ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * If someone moved the group out from under us, check
+ * if this new event wound up on the same ctx, if so
+ * it's the regular !move_group case, otherwise fail.
+ */
+ if (gctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ } else {
+ perf_event_ctx_unlock(group_leader, gctx);
+ move_group = 0;
+ }
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -9712,7 +10017,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -9738,7 +10043,7 @@ SYSCALL_DEFINE5(perf_event_open,
err_locked:
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);
@@ -10005,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -10453,6 +10758,11 @@ static void __init perf_event_init_all_cpus(void)
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+#ifdef CONFIG_CGROUP_PERF
+ INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8c50276b60d1..d416f3baf392 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
* Returns 0 on success, -EFAULT on failure.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, struct page *kpage)
+ struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
@@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
false);
if (err)
return err;
/* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(page);
+ lock_page(old_page);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
- ptep = page_check_address(page, mm, addr, &ptl, 0);
+ ptep = page_check_address(old_page, mm, addr, &ptl, 0);
if (!ptep) {
- mem_cgroup_cancel_charge(kpage, memcg, false);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
- get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr, false);
- mem_cgroup_commit_charge(kpage, memcg, false, false);
- lru_cache_add_active_or_unevictable(kpage, vma);
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
- if (!PageAnon(page)) {
- dec_mm_counter(mm, mm_counter_file(page));
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(page, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
+ page_remove_rmap(old_page, false);
+ if (!page_mapped(old_page))
+ try_to_free_swap(old_page);
pte_unmap_unlock(ptep, ptl);
if (vma->vm_flags & VM_LOCKED)
- munlock_vma_page(page);
- put_page(page);
+ munlock_vma_page(old_page);
+ put_page(old_page);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- unlock_page(page);
+ unlock_page(old_page);
return err;
}
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+ &vma, NULL);
if (ret <= 0)
return ret;
@@ -1193,7 +1194,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index 091a78be3b09..580da79e38ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -14,7 +14,6 @@
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
-#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
@@ -54,8 +53,10 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/random.h>
+#include <linux/rcuwait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
@@ -85,17 +86,16 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
- cputime_t utime, stime;
+ u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
+#ifdef CONFIG_POSIX_TIMERS
posix_cpu_timers_exit(tsk);
if (group_dead) {
posix_cpu_timers_exit_group(tsk);
- tty = sig->tty;
- sig->tty = NULL;
} else {
/*
* This can only happen if the caller is de_thread().
@@ -104,7 +104,13 @@ static void __exit_signal(struct task_struct *tsk)
*/
if (unlikely(has_group_leader_pid(tsk)))
posix_cpu_timers_exit_group(tsk);
+ }
+#endif
+ if (group_dead) {
+ tty = sig->tty;
+ sig->tty = NULL;
+ } else {
/*
* If there is any task waiting for the group exit
* then notify it:
@@ -116,6 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
+ add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ sizeof(unsigned long long));
+
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
@@ -273,6 +282,35 @@ retry:
return task;
}
+/**
+ * rcuwait_wake_up - wake up the task waiting on @w, if any
+ * @w: rcuwait whose ->task pointer names the waiter (may be NULL)
+ *
+ * Loads @w->task under rcu_read_lock() and, when it is non-NULL, hands it
+ * to wake_up_process(). The caller is expected to have made its wake-up
+ * condition true before calling this function.
+ */
+void rcuwait_wake_up(struct rcuwait *w)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+
+ /*
+ * Order condition vs @task, such that everything prior to the load
+ * of @task is visible. This is the condition as to why the user called
+ * rcuwait_wake_up() in the first place. Pairs with set_current_state()
+ * barrier (A) in rcuwait_wait_event().
+ *
+ * WAIT WAKE
+ * [S] tsk = current [S] cond = true
+ * MB (A) MB (B)
+ * [L] cond [L] tsk
+ *
+ * (B) must order the waker's store to the condition against its later
+ * load of @task (store->load). A read barrier only orders loads against
+ * loads, so a full barrier is required; otherwise the waker could see a
+ * stale NULL @task while the waiter misses the condition.
+ */
+ smp_mb(); /* (B) */
+
+ /*
+ * Avoid using task_rcu_dereference() magic as long as we are careful,
+ * see comment in rcuwait_wait_event() regarding ->exit_state.
+ */
+ task = rcu_dereference(w->task);
+ if (task)
+ wake_up_process(task);
+ rcu_read_unlock();
+}
+
struct task_struct *try_get_task_struct(struct task_struct **ptask)
{
struct task_struct *task;
@@ -459,12 +497,12 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct *tsk)
+static void exit_mm(void)
{
- struct mm_struct *mm = tsk->mm;
+ struct mm_struct *mm = current->mm;
struct core_state *core_state;
- mm_release(tsk, mm);
+ mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
@@ -482,7 +520,7 @@ static void exit_mm(struct task_struct *tsk)
up_read(&mm->mmap_sem);
- self.task = tsk;
+ self.task = current;
self.next = xchg(&core_state->dumper.next, &self);
/*
* Implies mb(), the result of xchg() must be visible
@@ -492,26 +530,26 @@ static void exit_mm(struct task_struct *tsk)
complete(&core_state->startup);
for (;;) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
- __set_task_state(tsk, TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
- BUG_ON(mm != tsk->active_mm);
+ BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
- task_lock(tsk);
- tsk->mm = NULL;
+ task_lock(current);
+ current->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
- task_unlock(tsk);
+ task_unlock(current);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- exit_oom_victim(tsk);
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -725,7 +763,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-void do_exit(long code)
+void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
@@ -799,8 +837,10 @@ void do_exit(long code)
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
+#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
@@ -812,7 +852,7 @@ void do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
- exit_mm(tsk);
+ exit_mm();
if (group_dead)
acct_process();
@@ -836,6 +876,7 @@ void do_exit(long code)
*/
perf_event_exit_task(tsk);
+ sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
/*
@@ -882,29 +923,7 @@ void do_exit(long code)
exit_rcu();
TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
- /*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNINNG.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- *
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
-
- /* causes final put_task_struct in finish_task_switch(). */
- tsk->state = TASK_DEAD;
- tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- schedule();
- BUG();
- /* Avoid "noreturn function does return". */
- for (;;)
- cpu_relax(); /* For when BUG is null */
+ do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -1101,7 +1120,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
struct signal_struct *sig = p->signal;
struct signal_struct *psig = current->signal;
unsigned long maxrss;
- cputime_t tgutime, tgstime;
+ u64 tgutime, tgstime;
/*
* The resource counters for the group leader are in its
@@ -1370,7 +1389,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
@@ -1390,20 +1409,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (!ret)
return ret;
- ret = security_task_wait(p);
- if (unlikely(ret < 0)) {
- /*
- * If we have not yet seen any eligible child,
- * then let this error code replace -ECHILD.
- * A permission error will give the user a clue
- * to look for security policy problems, rather
- * than for mysterious wait bugs.
- */
- if (wo->notask_error)
- wo->notask_error = ret;
- return 0;
- }
-
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
@@ -1496,7 +1501,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..e1359474baa5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,9 +20,10 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <asm/sections.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_ftrace_trampoline(addr))
return 1;
+ if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
- return is_ftrace_trampoline(addr);
+ if (is_ftrace_trampoline(addr))
+ return 1;
+ if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+ return 1;
+ return 0;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e2..ff82e24573b6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -79,7 +79,7 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush. Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
+#ifdef CONFIG_VMAP_STACK
+ void *stack;
+ int i;
+
+ local_irq_disable();
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+ if (!s)
+ continue;
+ this_cpu_write(cached_stacks[i], NULL);
+
+ tsk->stack_vm_area = s;
+ local_irq_enable();
+ return s->addr;
+ }
+ local_irq_enable();
+
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ VMALLOC_START, VMALLOC_END,
+ THREADINFO_GFP | __GFP_HIGHMEM,
+ PAGE_KERNEL,
+ 0, node, __builtin_return_address(0));
+
+ /*
+ * We can't call find_vm_area() in interrupt context, and
+ * free_thread_stack() can be called in interrupt context,
+ * so cache the vm_struct.
+ */
+ if (stack)
+ tsk->stack_vm_area = find_vm_area(stack);
+ return stack;
+#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
+#endif
}
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
{
- __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+ if (task_stack_vm_area(tsk)) {
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_read(cached_stacks[i]))
+ continue;
+
+ this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+ local_irq_restore(flags);
+ return;
+ }
+ local_irq_restore(flags);
+
+ vfree_atomic(tsk->stack);
+ return;
+ }
+#endif
+
+ __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
}
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, stack);
+ kmem_cache_free(thread_stack_cache, tsk->stack);
}
void thread_stack_cache_init(void)
@@ -213,28 +277,85 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
{
- /* All stack pages are in the same zone and belong to the same memcg. */
- struct page *first_page = virt_to_page(stack);
+ void *stack = task_stack_page(tsk);
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+ if (vm) {
+ int i;
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_zone_page_state(page_zone(vm->pages[i]),
+ NR_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024 * account);
+ }
- memcg_kmem_update_page_stat(
- first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ /* All stack pages belong to the same memcg. */
+ memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ } else {
+ /*
+ * All stack pages are in the same zone and belong to the
+ * same memcg.
+ */
+ struct page *first_page = virt_to_page(stack);
+
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
+
+ memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ }
}
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
{
- account_kernel_stack(tsk->stack, -1);
+ if (WARN_ON(tsk->state != TASK_DEAD))
+ return; /* Better to leak the stack than to free prematurely */
+
+ account_kernel_stack(tsk, -1);
arch_release_thread_stack(tsk->stack);
- free_thread_stack(tsk->stack);
+ free_thread_stack(tsk);
+ tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+ if (atomic_dec_and_test(&tsk->stack_refcount))
+ release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
+ if (tsk->flags & PF_KTHREAD)
+ free_kthread_struct(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -243,6 +364,12 @@ static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
+ /*
+ * __mmdrop is not safe to call from softirq context on x86 due to
+ * pgd_dtor so postpone it to the async context
+ */
+ if (sig->oom_mm)
+ mmdrop_async(sig->oom_mm);
kmem_cache_free(signal_cachep, sig);
}
@@ -302,13 +429,16 @@ int arch_task_struct_size __read_mostly;
void __init fork_init(void)
{
+ int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
-#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
+#define ARCH_MIN_TASKALIGN 0
#endif
+ int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+
/* create a slab on which task_structs can be allocated */
task_struct_cachep = kmem_cache_create("task_struct",
- arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ arch_task_struct_size, align,
SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
@@ -321,6 +451,10 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ init_user_ns.ucount_max[i] = max_threads/2;
+ }
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -342,6 +476,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
+ struct vm_struct *stack_vm_area;
int err;
if (node == NUMA_NO_NODE)
@@ -354,11 +489,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!stack)
goto free_tsk;
+ stack_vm_area = task_stack_vm_area(tsk);
+
err = arch_dup_task_struct(tsk, orig);
+
+ /*
+ * arch_dup_task_struct() clobbers the stack-related fields. Make
+ * sure they're properly initialized before using any stack-related
+ * functions again.
+ */
+ tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ atomic_set(&tsk->stack_refcount, 1);
+#endif
+
if (err)
goto free_stack;
- tsk->stack = stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -390,21 +540,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- account_kernel_stack(stack, 1);
+ account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
return tsk;
free_stack:
- free_thread_stack(stack);
+ free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}
#ifdef CONFIG_MMU
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
@@ -598,7 +749,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ struct user_namespace *user_ns)
{
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
@@ -638,6 +790,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
if (init_new_context(p, mm))
goto fail_nocontext;
+ mm->user_ns = get_user_ns(user_ns);
return mm;
fail_nocontext:
@@ -683,7 +836,7 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current);
+ return mm_init(mm, current, current_user_ns());
}
/*
@@ -698,6 +851,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ put_user_ns(mm->user_ns);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -711,6 +865,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
+ mm_put_huge_zero_page(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
@@ -719,6 +874,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
@@ -977,7 +1133,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk))
+ if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
err = dup_mmap(mm, oldmm);
@@ -1150,6 +1306,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
}
}
+#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a thread group.
*/
@@ -1159,7 +1316,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
+ sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
sig->cputimer.running = true;
}
@@ -1168,6 +1325,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
INIT_LIST_HEAD(&sig->cpu_timers[1]);
INIT_LIST_HEAD(&sig->cpu_timers[2]);
}
+#else
+static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
+#endif
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
@@ -1192,12 +1352,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
- INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
+#ifdef CONFIG_POSIX_TIMERS
+ INIT_LIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
+#endif
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -1269,6 +1431,7 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
+#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1281,6 +1444,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}
+#else
+static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
+#endif
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
@@ -1296,7 +1462,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static struct task_struct *copy_process(unsigned long clone_flags,
+static __latent_entropy struct task_struct *copy_process(
+ unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
@@ -1390,7 +1557,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -1401,7 +1568,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
init_sigpending(&p->pending);
p->utime = p->stime = p->gtime = 0;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
p->utimescaled = p->stimescaled = 0;
+#endif
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1715,6 +1884,8 @@ bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
+ p->state = TASK_DEAD;
+ put_task_stack(p);
free_task(p);
fork_out:
return ERR_PTR(retval);
@@ -1780,6 +1951,7 @@ long _do_fork(unsigned long clone_flags,
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ add_latent_entropy();
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/futex.c b/kernel/futex.c
index 46cb3a301bc1..cdf365036141 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-/*
- * We hash on the keys returned from get_futex_key (see below).
+/**
+ * hash_futex - Return the hash bucket in the global hash
+ * @key: Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
@@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
return &futex_queues[hash & (futex_hashsize - 1)];
}
-/*
+
+/**
+ * match_futex - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
* Return 1 if two futex_keys are equal, 0 otherwise.
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
@@ -1289,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
bool deboost;
int ret = 0;
@@ -1406,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -1460,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1699,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
if (requeue_pi) {
/*
@@ -2450,7 +2459,7 @@ retry:
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
- restart->futex.time = abs_time->tv64;
+ restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
@@ -2471,7 +2480,7 @@ static long futex_wait_restart(struct restart_block *restart)
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t.tv64 = restart->futex.time;
+ t = restart->futex.time;
tp = &t;
}
restart->fn = do_no_restart_syscall;
@@ -3314,4 +3323,4 @@ static int __init futex_init(void)
return 0;
}
-__initcall(futex_init);
+core_initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 4ae3232e7a28..3f409968e466 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -13,7 +13,7 @@
#include <linux/ptrace.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
diff --git a/kernel/groups.c b/kernel/groups.c
index 74d431d25251..8dd7a61b7115 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -7,55 +7,31 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
struct group_info *groups_alloc(int gidsetsize)
{
- struct group_info *group_info;
- int nblocks;
- int i;
-
- nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
- /* Make sure we always allocate at least one indirect block pointer */
- nblocks = nblocks ? : 1;
- group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
- if (!group_info)
+ struct group_info *gi;
+ unsigned int len;
+
+ len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
+ gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
+ if (!gi)
+ gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL);
+ if (!gi)
return NULL;
- group_info->ngroups = gidsetsize;
- group_info->nblocks = nblocks;
- atomic_set(&group_info->usage, 1);
-
- if (gidsetsize <= NGROUPS_SMALL)
- group_info->blocks[0] = group_info->small_block;
- else {
- for (i = 0; i < nblocks; i++) {
- kgid_t *b;
- b = (void *)__get_free_page(GFP_USER);
- if (!b)
- goto out_undo_partial_alloc;
- group_info->blocks[i] = b;
- }
- }
- return group_info;
-out_undo_partial_alloc:
- while (--i >= 0) {
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
- return NULL;
+ atomic_set(&gi->usage, 1);
+ gi->ngroups = gidsetsize;
+ return gi;
}
EXPORT_SYMBOL(groups_alloc);
void groups_free(struct group_info *group_info)
{
- if (group_info->blocks[0] != group_info->small_block) {
- int i;
- for (i = 0; i < group_info->nblocks; i++)
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
+ kvfree(group_info);
}
EXPORT_SYMBOL(groups_free);
@@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist,
for (i = 0; i < count; i++) {
gid_t gid;
- gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ gid = from_kgid_munged(user_ns, group_info->gid[i]);
if (put_user(gid, grouplist+i))
return -EFAULT;
}
@@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info,
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
}
@@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info)
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
- kgid_t tmp = GROUP_AT(group_info, right);
+ kgid_t tmp = group_info->gid[right];
- while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
- GROUP_AT(group_info, right) =
- GROUP_AT(group_info, left);
+ while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
+ group_info->gid[right] = group_info->gid[left];
right = left;
left -= stride;
}
- GROUP_AT(group_info, right) = tmp;
+ group_info->gid[right] = tmp;
}
stride /= 3;
}
@@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- if (gid_gt(grp, GROUP_AT(group_info, mid)))
+ if (gid_gt(grp, group_info->gid[mid]))
left = mid + 1;
- else if (gid_lt(grp, GROUP_AT(group_info, mid)))
+ else if (gid_lt(grp, group_info->gid[mid]))
right = mid;
else
return 1;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d234022805dc..40c07e4fa116 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,26 +98,27 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
trace_sched_process_hang(t);
- if (!sysctl_hung_task_warnings)
+ if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
return;
- if (sysctl_hung_task_warnings > 0)
- sysctl_hung_task_warnings--;
-
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, timeout);
- pr_err(" %s %s %.*s\n",
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- debug_show_held_locks(t);
+ if (sysctl_hung_task_warnings) {
+ if (sysctl_hung_task_warnings > 0)
+ sysctl_hung_task_warnings--;
+ pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+ t->comm, t->pid, timeout);
+ pr_err(" %s %s %.*s\n",
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ debug_show_all_locks();
+ }
touch_nmi_watchdog();
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfcff212..4544b115f5eb 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,60 +4,153 @@
#include <linux/slab.h>
#include <linux/cpu.h>
-static int get_first_sibling(unsigned int cpu)
+static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+ int cpus_per_vec)
{
- unsigned int ret;
+ const struct cpumask *siblmsk;
+ int cpu, sibl;
- ret = cpumask_first(topology_sibling_cpumask(cpu));
- if (ret < nr_cpu_ids)
- return ret;
- return cpu;
+ for ( ; cpus_per_vec > 0; ) {
+ cpu = cpumask_first(nmsk);
+
+ /* Should not happen, but I'm too lazy to think about it */
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ cpumask_clear_cpu(cpu, nmsk);
+ cpumask_set_cpu(cpu, irqmsk);
+ cpus_per_vec--;
+
+ /* If the cpu has siblings, use them first */
+ siblmsk = topology_sibling_cpumask(cpu);
+ for (sibl = -1; cpus_per_vec > 0; ) {
+ sibl = cpumask_next(sibl, siblmsk);
+ if (sibl >= nr_cpu_ids)
+ break;
+ if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+ continue;
+ cpumask_set_cpu(sibl, irqmsk);
+ cpus_per_vec--;
+ }
+ }
+}
+
+static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
+{
+ int n, nodes = 0;
+
+ /* Calculate the number of nodes in the supplied affinity mask */
+ for_each_online_node(n) {
+ if (cpumask_intersects(mask, cpumask_of_node(n))) {
+ node_set(n, *nodemsk);
+ nodes++;
+ }
+ }
+ return nodes;
}
-/*
- * Take a map of online CPUs and the number of available interrupt vectors
- * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
- * so that they are distributed as good as possible around the CPUs. If
- * more vectors than CPUs are available we'll map one to each CPU,
- * otherwise we map one to the first sibling of each socket.
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd: Description of the affinity requirements
*
- * If there are more vectors than CPUs we will still only have one bit
- * set per CPU, but interrupt code will keep on assigning the vectors from
- * the start of the bitmap until we run out of vectors.
+ * Returns the masks pointer or NULL if allocation failed.
*/
-struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
{
- struct cpumask *affinity_mask;
- unsigned int max_vecs = *nr_vecs;
+ int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+ int affv = nvecs - affd->pre_vectors - affd->post_vectors;
+ int last_affv = affv + affd->pre_vectors;
+ nodemask_t nodemsk = NODE_MASK_NONE;
+ struct cpumask *masks;
+ cpumask_var_t nmsk;
- if (max_vecs == 1)
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
- affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!affinity_mask) {
- *nr_vecs = 1;
- return NULL;
- }
+ masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto out;
+ /* Fill out vectors at the beginning that don't need affinity */
+ for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+ cpumask_copy(masks + curvec, irq_default_affinity);
+
+ /* Stabilize the cpumasks */
get_online_cpus();
- if (max_vecs >= num_online_cpus()) {
- cpumask_copy(affinity_mask, cpu_online_mask);
- *nr_vecs = num_online_cpus();
- } else {
- unsigned int vecs = 0, cpu;
-
- for_each_online_cpu(cpu) {
- if (cpu == get_first_sibling(cpu)) {
- cpumask_set_cpu(cpu, affinity_mask);
- vecs++;
- }
+ nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk);
- if (--max_vecs == 0)
+ /*
+ * If the number of nodes in the mask is greater than or equal to the
+ * number of vectors we just spread the vectors across the nodes.
+ */
+ if (affv <= nodes) {
+ for_each_node_mask(n, nodemsk) {
+ cpumask_copy(masks + curvec, cpumask_of_node(n));
+ if (++curvec == last_affv)
break;
}
- *nr_vecs = vecs;
+ goto done;
}
+
+ /* Spread the vectors per node */
+ vecs_per_node = affv / nodes;
+ /* Account for rounding errors */
+ extra_vecs = affv - (nodes * vecs_per_node);
+
+ for_each_node_mask(n, nodemsk) {
+ int ncpus, v, vecs_to_assign = vecs_per_node;
+
+ /* Get the cpus on this node which are in the mask */
+ cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
+
+ /* Calculate the number of cpus per vector */
+ ncpus = cpumask_weight(nmsk);
+
+ for (v = 0; curvec < last_affv && v < vecs_to_assign;
+ curvec++, v++) {
+ cpus_per_vec = ncpus / vecs_to_assign;
+
+ /* Account for extra vectors to compensate rounding errors */
+ if (extra_vecs) {
+ cpus_per_vec++;
+ if (!--extra_vecs)
+ vecs_per_node++;
+ }
+ irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+ }
+
+ if (curvec >= last_affv)
+ break;
+ }
+
+done:
+ put_online_cpus();
+
+ /* Fill out vectors at the end that don't need affinity */
+ for (; curvec < nvecs; curvec++)
+ cpumask_copy(masks + curvec, irq_default_affinity);
+out:
+ free_cpumask_var(nmsk);
+ return masks;
+}
+
+/**
+ * irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @maxvec: The maximum number of vectors available
+ * @affd: Description of the affinity requirements
+ */
+int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+{
+ int resv = affd->pre_vectors + affd->post_vectors;
+ int vecs = maxvec - resv;
+ int cpus;
+
+ /* Stabilize the cpumasks */
+ get_online_cpus();
+ cpus = cpumask_weight(cpu_online_mask);
put_online_cpus();
- return affinity_mask;
+ return min(cpus, vecs) + resv;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 26ba5654d9d5..be3c34e4f2ac 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
if (!desc)
return -EINVAL;
- type &= IRQ_TYPE_SENSE_MASK;
ret = __irq_set_trigger(desc, type);
irq_put_desc_busunlock(desc, flags);
return ret;
@@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
- void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
@@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
- trace_irq_handler_entry(irq, action);
- res = action->handler(irq, dev_id);
- trace_irq_handler_exit(irq, action, res);
+ if (likely(action)) {
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+ } else {
+ unsigned int cpu = smp_processor_id();
+ bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+
+ if (enabled)
+ irq_percpu_disable(desc, cpu);
+
+ pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n",
+ enabled ? " and unmasked" : "", irq, cpu);
+ }
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
-void
+static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
{
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 74d90a754268..1613bfd48365 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -2,6 +2,7 @@
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/gfp.h>
+#include <linux/irq.h>
/*
* Device resource management aware IRQ request/free implementation.
@@ -33,7 +34,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
@@ -57,6 +58,9 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
@@ -80,7 +84,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
*
* Except for the extra @dev argument, this function takes the
@@ -103,6 +107,9 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
if (rc < 0) {
devres_free(dr);
@@ -137,3 +144,57 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);
+
+struct irq_desc_devres {
+ unsigned int from;
+ unsigned int cnt;
+};
+
+static void devm_irq_desc_release(struct device *dev, void *res)
+{
+ struct irq_desc_devres *this = res;
+
+ irq_free_descs(this->from, this->cnt);
+}
+
+/**
+ * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors
+ * for a managed device
+ * @dev: Device to allocate the descriptors for
+ * @irq: Allocate for specific irq number if irq >= 0
+ * @from: Start the search from this irq number
+ * @cnt: Number of consecutive irqs to allocate
+ * @node: Preferred node on which the irq descriptor should be allocated
+ * @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask array of size @cnt
+ * which hints where the irq descriptors should be allocated
+ * and which default affinities to use
+ *
+ * Returns the first irq number or error code.
+ *
+ * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity.
+ */
+int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
+ unsigned int cnt, int node, struct module *owner,
+ const struct cpumask *affinity)
+{
+ struct irq_desc_devres *dr;
+ int base;
+
+ dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity);
+ if (base < 0) {
+ devres_free(dr);
+ return base;
+ }
+
+ dr->from = base;
+ dr->cnt = cnt;
+ devres_add(dev, dr);
+
+ return base;
+}
+EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index abd286afbd27..ee32870079c9 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
* @d: irq domain for which to allocate chips
- * @irqs_per_chip: Number of interrupts each chip handles
+ * @irqs_per_chip: Number of interrupts each chip handles (max 32)
* @num_ct: Number of irq_chip_type instances associated with this
* @name: Name of the irq chip
* @handler: Default flow handler associated with these chips
@@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
* @set: IRQ_* bits to set in the mapping function
* @gcflags: Generic chip specific setup flags
*/
-int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
- int num_ct, const char *name,
- irq_flow_handler_t handler,
- unsigned int clr, unsigned int set,
- enum irq_gc_flags gcflags)
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
@@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
d->name = name;
return 0;
}
-EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
+
+static struct irq_chip_generic *
+__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ int idx;
+
+ if (!dgc)
+ return ERR_PTR(-ENODEV);
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return ERR_PTR(-EINVAL);
+ return dgc->gc[idx];
+}
/**
* irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
@@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
struct irq_chip_generic *
irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
{
- struct irq_domain_chip_generic *dgc = d->gc;
- int idx;
+ struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq);
- if (!dgc)
- return NULL;
- idx = hw_irq / dgc->irqs_per_chip;
- if (idx >= dgc->num_chips)
- return NULL;
- return dgc->gc[idx];
+ return !IS_ERR(gc) ? gc : NULL;
}
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
@@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
unsigned long flags;
int idx;
- if (!d->gc)
- return -ENODEV;
-
- idx = hw_irq / dgc->irqs_per_chip;
- if (idx >= dgc->num_chips)
- return -EINVAL;
- gc = dgc->gc[idx];
+ gc = __irq_get_domain_generic_chip(d, hw_irq);
+ if (IS_ERR(gc))
+ return PTR_ERR(gc);
idx = hw_irq % dgc->irqs_per_chip;
@@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a623b44f2d4b..00bb0aeea1d0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -15,6 +15,7 @@
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
#include <linux/irqdomain.h>
+#include <linux/sysfs.h>
#include "internals.h"
@@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
#ifdef CONFIG_SPARSE_IRQ
+static void irq_kobj_release(struct kobject *kobj);
+
+#ifdef CONFIG_SYSFS
+static struct kobject *irq_kobj_base;
+
+#define IRQ_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+static ssize_t per_cpu_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ int cpu, irq = desc->irq_data.irq;
+ ssize_t ret = 0;
+ char *p = "";
+
+ for_each_possible_cpu(cpu) {
+ unsigned int c = kstat_irqs_cpu(irq, cpu);
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
+ p = ",";
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+IRQ_ATTR_RO(per_cpu_count);
+
+static ssize_t chip_name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name) {
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n",
+ desc->irq_data.chip->name);
+ }
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(chip_name);
+
+static ssize_t hwirq_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->irq_data.domain)
+ ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(hwirq);
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ ret = sprintf(buf, "%s\n",
+ irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+
+}
+IRQ_ATTR_RO(type);
+
+static ssize_t name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->name)
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+}
+IRQ_ATTR_RO(name);
+
+static ssize_t actions_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ struct irqaction *action;
+ ssize_t ret = 0;
+ char *p = "";
+
+ raw_spin_lock_irq(&desc->lock);
+ for (action = desc->action; action != NULL; action = action->next) {
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ p, action->name);
+ p = ",";
+ }
+ raw_spin_unlock_irq(&desc->lock);
+
+ if (ret)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+IRQ_ATTR_RO(actions);
+
+static struct attribute *irq_attrs[] = {
+ &per_cpu_count_attr.attr,
+ &chip_name_attr.attr,
+ &hwirq_attr.attr,
+ &type_attr.attr,
+ &name_attr.attr,
+ &actions_attr.attr,
+ NULL
+};
+
+static struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = irq_attrs,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc)
+{
+ if (irq_kobj_base) {
+ /*
+ * Continue even in case of failure as this is nothing
+ * crucial.
+ */
+ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
+ pr_warn("Failed to add kobject for irq %d\n", irq);
+ }
+}
+
+static int __init irq_sysfs_init(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ /* Prevent concurrent irq alloc/free */
+ irq_lock_sparse();
+
+ irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
+ if (!irq_kobj_base) {
+ irq_unlock_sparse();
+ return -ENOMEM;
+ }
+
+ /* Add the already allocated interrupts */
+ for_each_irq_desc(irq, desc)
+ irq_sysfs_add(irq, desc);
+ irq_unlock_sparse();
+
+ return 0;
+}
+postcore_initcall(irq_sysfs_init);
+
+#else /* !CONFIG_SYSFS */
+
+static struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+
+#endif /* CONFIG_SYSFS */
+
static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
@@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
+ kobject_init(&desc->kobj, &irq_kobj_type);
return desc;
@@ -197,15 +374,22 @@ err_desc:
return NULL;
}
-static void delayed_free_desc(struct rcu_head *rhp)
+static void irq_kobj_release(struct kobject *kobj)
{
- struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
free_masks(desc);
free_percpu(desc->kstat_irqs);
kfree(desc);
}
+static void delayed_free_desc(struct rcu_head *rhp)
+{
+ struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+
+ kobject_put(&desc->kobj);
+}
+
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -217,8 +401,12 @@ static void free_desc(unsigned int irq)
* kstat_irq_usr(). Once we deleted the descriptor from the
* sparse tree we can free it. Access in proc will fail to
* lookup the descriptor.
+ *
+ * The sysfs entry must be serialized against a concurrent
+ * irq_sysfs_init() as well.
*/
mutex_lock(&sparse_irq_lock);
+ kobject_del(&desc->kobj);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
@@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
const struct cpumask *mask = NULL;
struct irq_desc *desc;
unsigned int flags;
- int i, cpu = -1;
+ int i;
- if (affinity && cpumask_empty(affinity))
- return -EINVAL;
+ /* Validate affinity mask(s) */
+ if (affinity) {
+ for (i = 0, mask = affinity; i < cnt; i++, mask++) {
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ }
flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+ mask = NULL;
for (i = 0; i < cnt; i++) {
if (affinity) {
- cpu = cpumask_next(cpu, affinity);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(affinity);
- node = cpu_to_node(cpu);
-
- /*
- * For single allocations we use the caller provided
- * mask otherwise we use the mask of the target cpu
- */
- mask = cnt == 1 ? affinity : cpumask_of(cpu);
+ node = cpu_to_node(cpumask_first(affinity));
+ mask = affinity;
+ affinity++;
}
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
+ irq_sysfs_add(start + i, desc);
mutex_unlock(&sparse_irq_lock);
}
return start;
@@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
- * @affinity: Optional pointer to an affinity mask which hints where the
- * irq descriptors should be allocated and which default
- * affinities to use
+ * @affinity: Optional pointer to an affinity mask array of size @cnt which
+ * hints where the irq descriptors should be allocated and which
+ * default affinities to use
*
* Returns the first irq number or error code
*/
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4752b43662e0..31805f237396 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
- * @of_node: optional device-tree node of the interrupt controller
+ * @fwnode: firmware node for the interrupt controller
* @size: Size of linear map; 0 for radix mapping only
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
@@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
const struct irq_domain_ops *ops,
void *host_data)
{
+ struct device_node *of_node = to_of_node(fwnode);
struct irq_domain *domain;
- struct device_node *of_node;
-
- of_node = to_of_node(fwnode);
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
@@ -280,6 +278,31 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
+ * irq_domain_check_msi_remap - Check whether all MSI irq domains implement
+ * IRQ remapping
+ *
+ * Return: false if any MSI irq domain does not support IRQ remapping,
+ * true otherwise (including if there is no MSI irq domain)
+ */
+bool irq_domain_check_msi_remap(void)
+{
+ struct irq_domain *h;
+ bool ret = true;
+
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(h, &irq_domain_list, link) {
+ if (irq_domain_is_msi(h) &&
+ !irq_domain_hierarchical_is_msi_remap(h)) {
+ ret = false;
+ break;
+ }
+ }
+ mutex_unlock(&irq_domain_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap);
+
+/**
* irq_set_default_host() - Set a "default" irq domain
* @domain: default domain pointer
*
@@ -868,7 +891,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
if (WARN_ON(intsize < 1))
return -EINVAL;
*out_hwirq = intspec[0];
- *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+ if (intsize > 1)
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ else
+ *out_type = IRQ_TYPE_NONE;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
@@ -1345,6 +1371,30 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
+static void __irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ __irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ __irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
@@ -1355,13 +1405,9 @@ EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
*/
void irq_domain_activate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (irq_data->parent_data)
- irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
+ if (!irqd_is_activated(irq_data)) {
+ __irq_domain_activate_irq(irq_data);
+ irqd_set_activated(irq_data);
}
}
@@ -1375,13 +1421,9 @@ void irq_domain_activate_irq(struct irq_data *irq_data)
*/
void irq_domain_deactivate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd_is_activated(irq_data)) {
+ __irq_domain_deactivate_irq(irq_data);
+ irqd_clr_activated(irq_data);
}
}
@@ -1391,6 +1433,20 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
if (domain->ops->alloc)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
+
+/**
+ * irq_domain_hierarchical_is_msi_remap - Check if the domain or any
+ * parent has MSI remapping support
+ * @domain: domain pointer
+ */
+bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
+{
+ for (; domain; domain = domain->parent) {
+ if (irq_domain_is_msi_remap(domain))
+ return true;
+ }
+ return false;
+}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/**
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9530fcd27704..6b669593e7eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
return 0;
}
- flags &= IRQ_TYPE_SENSE_MASK;
-
if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
if (!irqd_irq_masked(&desc->irq_data))
mask_irq(desc);
@@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
unmask = 1;
}
- /* caller masked out all except trigger mode flags */
+ /* Mask all flags except trigger mode */
+ flags &= IRQ_TYPE_SENSE_MASK;
ret = chip->irq_set_type(&desc->irq_data, flags);
switch (ret) {
@@ -722,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
irq_put_desc_unlock(desc, flags);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
/*
@@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
- unsigned int omsk = irq_settings_get_trigger_mask(desc);
+ unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
pr_warn("irq %d uses trigger mode %u; requested %u\n",
- irq, nmsk, omsk);
+ irq, omsk, nmsk);
}
*old_ptr = new;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 19e9dfbe97fa..ddc2f5427f75 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,24 +14,44 @@
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
+#include <linux/slab.h>
-/* Temparory solution for building, will be removed later */
-#include <linux/pci.h>
-
-struct msi_desc *alloc_msi_entry(struct device *dev)
+/**
+ * alloc_msi_entry - Allocate and initialize msi_entry
+ * @dev: Pointer to the device for which this is allocated
+ * @nvec: The number of vectors used in this entry
+ * @affinity: Optional pointer to an affinity mask array of size @nvec
+ *
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks from @affinity are copied.
+ */
+struct msi_desc *
+alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
{
- struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ struct msi_desc *desc;
+
+ desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
+ desc->nvec_used = nvec;
+ if (affinity) {
+ desc->affinity = kmemdup(affinity,
+ nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ if (!desc->affinity) {
+ kfree(desc);
+ return NULL;
+ }
+ }
return desc;
}
void free_msi_entry(struct msi_desc *entry)
{
+ kfree(entry->affinity);
kfree(entry);
}
@@ -250,8 +270,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
- return irq_domain_create_hierarchy(parent, 0, 0, fwnode,
- &msi_domain_ops, info);
+ return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index feaa813b84a9..c53edad7b459 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -487,6 +487,8 @@ int show_interrupts(struct seq_file *p, void *v)
}
if (desc->irq_data.domain)
seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ else
+ seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
#endif
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 5707f97a3e6a..061ba7eed4ed 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -175,7 +175,9 @@ out:
static inline int bad_action_ret(irqreturn_t action_ret)
{
- if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+ unsigned int r = action_ret;
+
+ if (likely(r <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
return 0;
return 1;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..a9b8cf500591 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..85e5546cd791 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,18 +1,25 @@
#define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/preempt.h>
#include <linux/printk.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <asm/setup.h>
/*
* kcov descriptor (one per opened debugfs file).
@@ -53,13 +60,25 @@ void notrace __sanitizer_cov_trace_pc(void)
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
+ * The checks for whether we are in an interrupt are open-coded, because
+ * 1. We can't use in_interrupt() here, since it also returns true
+ * when we are inside local_bh_disable() section.
+ * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+ * since that leads to slower generated code (three separate tests,
+ * one for each of the flags).
*/
- if (!t || in_interrupt())
+ if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+ | NMI_MASK)))
return;
mode = READ_ONCE(t->kcov_mode);
if (mode == KCOV_MODE_TRACE) {
unsigned long *area;
unsigned long pos;
+ unsigned long ip = _RET_IP_;
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
/*
* There is some code that runs in interrupts but for which
@@ -73,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void)
/* The first word is number of subsequent PCs. */
pos = READ_ONCE(area[0]) + 1;
if (likely(pos < t->kcov_size)) {
- area[pos] = _RET_IP_;
+ area[pos] = ip;
WRITE_ONCE(area[0], pos);
}
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 561675589511..5617cc412444 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
while (hole_end <= crashk_res.end) {
unsigned long i;
+ cond_resched();
+
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
/* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
#endif
VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_X86
- VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
-#endif
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 037c321c5618..b56a558e406d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/fs.h>
+#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/syscalls.h>
@@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
return ret;
image->kernel_buf_len = size;
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
@@ -428,25 +432,65 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
+/**
+ * arch_kexec_walk_mem - call func(data) on free memory regions
+ * @kbuf: Context info for the search. Also passed to @func.
+ * @func: Function to call for each memory region.
+ *
+ * Return: The memory walk will stop when func returns a non-zero value
+ * and that value will be returned. If all free regions are visited without
+ * func returning non-zero, then zero will be returned.
+ */
+int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
+ int (*func)(u64, u64, void *))
+{
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return walk_iomem_res_desc(crashk_res.desc,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end,
+ kbuf, func);
+ else
+ return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
+}
+
+/**
+ * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ int ret;
+
+ ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
+
+/**
+ * kexec_add_buffer - place a buffer in a kexec segment
+ * @kbuf: Buffer contents and memory parameters.
+ *
+ * This function assumes that kexec_mutex is held.
+ * On successful return, @kbuf->mem will have the physical address of
+ * the buffer in memory.
+ *
+ * Return: 0 on success, negative errno on error.
*/
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
+int kexec_add_buffer(struct kexec_buf *kbuf)
{
struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
int ret;
/* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
+ if (!kbuf->image->file_mode)
return -EINVAL;
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX)
return -EINVAL;
/*
@@ -456,45 +500,27 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
* logic goes through list of segments to make sure there are
* no destination overlaps.
*/
- if (!list_empty(&image->control_pages)) {
+ if (!list_empty(&kbuf->image->control_pages)) {
WARN_ON(1);
return -EINVAL;
}
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
+ /* Ensure minimum alignment needed for segments. */
+ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
+ kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res_desc(crashk_res.desc,
- IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
+ ret = kexec_locate_mem_hole(kbuf);
+ if (ret)
+ return ret;
/* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
+ ksegment = &kbuf->image->segment[kbuf->image->nr_segments];
ksegment->kbuf = kbuf->buffer;
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
+ kbuf->image->nr_segments++;
return 0;
}
@@ -616,13 +642,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
unsigned long max, int top_down)
{
struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned long align, bss_align, bss_sz, bss_pad;
+ unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
unsigned char *buf_addr, *src;
int i, ret = 0, entry_sidx = -1;
const Elf_Shdr *sechdrs_c;
Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
+ struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
+ .buf_min = min, .buf_max = max,
+ .top_down = top_down };
/*
* sechdrs_c points to section headers in purgatory and are read
@@ -688,9 +716,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
}
/* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
bss_align = 1;
- buf_sz = 0;
bss_sz = 0;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
@@ -699,10 +725,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
align = sechdrs[i].sh_addralign;
if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
+ if (kbuf.buf_align < align)
+ kbuf.buf_align = align;
+ kbuf.bufsz = ALIGN(kbuf.bufsz, align);
+ kbuf.bufsz += sechdrs[i].sh_size;
} else {
/* bss section */
if (bss_align < align)
@@ -714,32 +740,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
/* Determine the bss padding required to align bss properly */
bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
+ if (kbuf.bufsz & (bss_align - 1))
+ bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
- memsz = buf_sz + bss_pad + bss_sz;
+ kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
/* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
+ kbuf.buffer = vzalloc(kbuf.bufsz);
+ if (!kbuf.buffer) {
ret = -ENOMEM;
goto out;
}
- if (buf_align < bss_align)
- buf_align = bss_align;
+ if (kbuf.buf_align < bss_align)
+ kbuf.buf_align = bss_align;
/* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out;
+ pi->purgatory_load_addr = kbuf.mem;
/* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
+ buf_addr = kbuf.buffer;
load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
+ bss_addr = load_addr + kbuf.bufsz + bss_pad;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -785,11 +810,11 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
* Used later to identify which section is purgatory and skip it
* from checksumming.
*/
- pi->purgatory_buf = purgatory_buf;
+ pi->purgatory_buf = kbuf.buffer;
return ret;
out:
vfree(sechdrs);
- vfree(purgatory_buf);
+ vfree(kbuf.buffer);
return ret;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 0a52315d9c62..4cef7e4706b0 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -20,22 +20,6 @@ struct kexec_sha_region {
unsigned long len;
};
-/*
- * Keeps track of buffer parameters as provided by caller for requesting
- * memory placement of buffer.
- */
-struct kexec_buf {
- struct kimage *image;
- char *buffer;
- unsigned long bufsz;
- unsigned long mem;
- unsigned long memsz;
- unsigned long buf_align;
- unsigned long buf_min;
- unsigned long buf_max;
- bool top_down; /* allocate from top of memory hole */
-};
-
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0277d1216f80..d45c96073afb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -39,7 +39,7 @@
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/module.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d10ab6b9b5e0..ebb4dadca66b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -49,10 +49,10 @@
#include <linux/cpu.h>
#include <linux/jump_label.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
+ /* Since the slot array is not protected by rcu, we need a mutex */
mutex_lock(&c->mutex);
retry:
- list_for_each_entry(kip, &c->pages, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->slot_used[i] = SLOT_USED;
kip->nused++;
slot = kip->insns + (i * c->insn_size);
+ rcu_read_unlock();
goto out;
}
}
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
WARN_ON(1);
}
}
+ rcu_read_unlock();
/* If there are any garbage slots, collect it and try again. */
if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->nused = 1;
kip->ngarbage = 0;
kip->cache = c;
- list_add(&kip->list, &c->pages);
+ list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
out:
mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
- list_del(&kip->list);
+ list_del_rcu(&kip->list);
+ synchronize_rcu();
kip->cache->free(kip->insns);
kfree(kip);
}
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_DIRTY &&
- collect_one_slot(kip, i))
+ if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
break;
}
}
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
+ long idx;
mutex_lock(&c->mutex);
- list_for_each_entry(kip, &c->pages, list) {
- long idx = ((long)slot - (long)kip->insns) /
- (c->insn_size * sizeof(kprobe_opcode_t));
- if (idx >= 0 && idx < slots_per_page(c)) {
- WARN_ON(kip->slot_used[idx] != SLOT_USED);
- if (dirty) {
- kip->slot_used[idx] = SLOT_DIRTY;
- kip->ngarbage++;
- if (++c->nr_garbage > slots_per_page(c))
- collect_garbage_slots(c);
- } else
- collect_one_slot(kip, idx);
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ idx = ((long)slot - (long)kip->insns) /
+ (c->insn_size * sizeof(kprobe_opcode_t));
+ if (idx >= 0 && idx < slots_per_page(c))
goto out;
- }
}
- /* Could not free this slot. */
+ /* Could not find this slot. */
WARN_ON(1);
+ kip = NULL;
out:
+ rcu_read_unlock();
+ /* Mark and sweep: this may sleep */
+ if (kip) {
+ /* Check double free */
+ WARN_ON(kip->slot_used[idx] != SLOT_USED);
+ if (dirty) {
+ kip->slot_used[idx] = SLOT_DIRTY;
+ kip->ngarbage++;
+ if (++c->nr_garbage > slots_per_page(c))
+ collect_garbage_slots(c);
+ } else {
+ collect_one_slot(kip, idx);
+ }
+ }
mutex_unlock(&c->mutex);
}
+/*
+ * Check whether the given address is on a page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+ struct kprobe_insn_page *kip;
+ bool ret = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (addr >= (unsigned long)kip->insns &&
+ addr < (unsigned long)kip->insns + PAGE_SIZE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173dca1ae..8461a4372e8a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -53,20 +53,29 @@ enum KTHREAD_BITS {
KTHREAD_IS_PARKED,
};
-#define __to_kthread(vfork) \
- container_of(vfork, struct kthread, exited)
+static inline void set_kthread_struct(void *kthread)
+{
+ /*
+ * We abuse ->set_child_tid to avoid the new member and because it
+ * can't be wrongly copied by copy_process(). We also rely on the fact
+ * that the caller can't exec, so PF_KTHREAD can't be cleared.
+ */
+ current->set_child_tid = (__force void __user *)kthread;
+}
static inline struct kthread *to_kthread(struct task_struct *k)
{
- return __to_kthread(k->vfork_done);
+ WARN_ON(!(k->flags & PF_KTHREAD));
+ return (__force void *)k->set_child_tid;
}
-static struct kthread *to_live_kthread(struct task_struct *k)
+void free_kthread_struct(struct task_struct *k)
{
- struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork))
- return __to_kthread(vfork);
- return NULL;
+ /*
+ * Can be NULL if this kthread was created by kernel_thread()
+ * or if kmalloc() in kthread() failed.
+ */
+ kfree(to_kthread(k));
}
/**
@@ -138,7 +147,7 @@ void *kthread_data(struct task_struct *task)
}
/**
- * probe_kthread_data - speculative version of kthread_data()
+ * kthread_probe_data - speculative version of kthread_data()
* @task: possible kthread task in question
*
* @task could be a kthread task. Return the data value specified when it
@@ -146,7 +155,7 @@ void *kthread_data(struct task_struct *task)
* inaccessible for any reason, %NULL is returned. This function requires
* that @task itself is safe to dereference.
*/
-void *probe_kthread_data(struct task_struct *task)
+void *kthread_probe_data(struct task_struct *task)
{
struct kthread *kthread = to_kthread(task);
void *data = NULL;
@@ -181,14 +190,11 @@ static int kthread(void *_create)
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
struct completion *done;
- struct kthread self;
+ struct kthread *self;
int ret;
- self.flags = 0;
- self.data = data;
- init_completion(&self.exited);
- init_completion(&self.parked);
- current->vfork_done = &self.exited;
+ self = kmalloc(sizeof(*self), GFP_KERNEL);
+ set_kthread_struct(self);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
@@ -196,6 +202,19 @@ static int kthread(void *_create)
kfree(create);
do_exit(-EINTR);
}
+
+ if (!self) {
+ create->result = ERR_PTR(-ENOMEM);
+ complete(done);
+ do_exit(-ENOMEM);
+ }
+
+ self->flags = 0;
+ self->data = data;
+ init_completion(&self->exited);
+ init_completion(&self->parked);
+ current->vfork_done = &self->exited;
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
@@ -203,12 +222,10 @@ static int kthread(void *_create)
schedule();
ret = -EINTR;
-
- if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
- __kthread_parkme(&self);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+ __kthread_parkme(self);
ret = threadfn(data);
}
- /* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -244,33 +261,11 @@ static void create_kthread(struct kthread_create_info *create)
}
}
-/**
- * kthread_create_on_node - create a kthread.
- * @threadfn: the function to run until signal_pending(current).
- * @data: data ptr for @threadfn.
- * @node: task and thread structures for the thread are allocated on this node
- * @namefmt: printf-style name for the thread.
- *
- * Description: This helper function creates and names a kernel
- * thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
- * is affine to all CPUs.
- *
- * If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
- * When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which no one will call kthread_stop(), or
- * return when 'kthread_should_stop()' is true (which means
- * kthread_stop() has been called). The return value should be zero
- * or a negative error number; it will be passed to kthread_stop().
- *
- * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
- */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data, int node,
- const char namefmt[],
- ...)
+static __printf(4, 0)
+struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
+ void *data, int node,
+ const char namefmt[],
+ va_list args)
{
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
@@ -311,11 +306,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
- va_list args;
- va_start(args, namefmt);
vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
- va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
@@ -326,6 +318,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
kfree(create);
return task;
}
+
+/**
+ * kthread_create_on_node - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @node: task and thread structures for the thread are allocated on this node
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
+ *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn() can either call do_exit() directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
+ */
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data, int node,
+ const char namefmt[],
+ ...)
+{
+ struct task_struct *task;
+ va_list args;
+
+ va_start(args, namefmt);
+ task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+ va_end(args);
+
+ return task;
+}
EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
@@ -390,15 +420,25 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
cpu);
if (IS_ERR(p))
return p;
+ kthread_bind(p, cpu);
+ /* CPU hotplug needs to bind once again when unparking the thread. */
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
- /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
- kthread_park(p);
return p;
}
-static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then it is
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
{
+ struct kthread *kthread = to_kthread(k);
+
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* We clear the IS_PARKED bit here as we don't wait
@@ -407,27 +447,15 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
* which might be about to be cleared.
*/
if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ /*
+ * Newly created kthread was parked when the CPU was offline.
+ * The binding was lost and we need to set it again.
+ */
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
wake_up_state(k, TASK_PARKED);
}
}
-
-/**
- * kthread_unpark - unpark a thread created by kthread_create().
- * @k: thread created by kthread_create().
- *
- * Sets kthread_should_park() for @k to return false, wakes it, and
- * waits for it to return. If the thread is marked percpu then its
- * bound to the cpu again.
- */
-void kthread_unpark(struct task_struct *k)
-{
- struct kthread *kthread = to_live_kthread(k);
-
- if (kthread)
- __kthread_unpark(k, kthread);
-}
EXPORT_SYMBOL_GPL(kthread_unpark);
/**
@@ -444,20 +472,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark);
*/
int kthread_park(struct task_struct *k)
{
- struct kthread *kthread = to_live_kthread(k);
- int ret = -ENOSYS;
+ struct kthread *kthread = to_kthread(k);
+
+ if (WARN_ON(k->flags & PF_EXITING))
+ return -ENOSYS;
- if (kthread) {
- if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- if (k != current) {
- wake_up_process(k);
- wait_for_completion(&kthread->parked);
- }
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
}
- ret = 0;
}
- return ret;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);
@@ -484,13 +512,11 @@ int kthread_stop(struct task_struct *k)
trace_sched_kthread_stop(k);
get_task_struct(k);
- kthread = to_live_kthread(k);
- if (kthread) {
- set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- __kthread_unpark(k, kthread);
- wake_up_process(k);
- wait_for_completion(&kthread->exited);
- }
+ kthread = to_kthread(k);
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ kthread_unpark(k);
+ wake_up_process(k);
+ wait_for_completion(&kthread->exited);
ret = k->exit_code;
put_task_struct(k);
@@ -536,39 +562,48 @@ int kthreadd(void *unused)
return 0;
}
-void __init_kthread_worker(struct kthread_worker *worker,
+void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
{
+ memset(worker, 0, sizeof(struct kthread_worker));
spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
- worker->task = NULL;
+ INIT_LIST_HEAD(&worker->delayed_work_list);
}
-EXPORT_SYMBOL_GPL(__init_kthread_worker);
+EXPORT_SYMBOL_GPL(__kthread_init_worker);
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
*
- * This function can be used as @threadfn to kthread_create() or
- * kthread_run() with @worker_ptr argument pointing to an initialized
- * kthread_worker. The started kthread will process work_list until
- * the it is stopped with kthread_stop(). A kthread can also call
- * this function directly after extra initialization.
+ * This function implements the main cycle of kthread worker. It processes
+ * work_list until it is stopped with kthread_stop(). It sleeps when the queue
+ * is empty.
*
- * Different kthreads can be used for the same kthread_worker as long
- * as there's only one kthread attached to it at any given time. A
- * kthread_worker without an attached kthread simply collects queued
- * kthread_works.
+ * The works are not allowed to keep any locks, disable preemption or interrupts
+ * when they finish. A safe point for freezing is defined after one work
+ * finishes and before a new one is started.
+ *
+ * Also the works must not be handled by more than one worker at the same time,
+ * see also kthread_queue_work().
*/
int kthread_worker_fn(void *worker_ptr)
{
struct kthread_worker *worker = worker_ptr;
struct kthread_work *work;
- WARN_ON(worker->task);
+ /*
+ * FIXME: Update the check and remove the assignment when all kthread
+ * worker users are created using kthread_create_worker*() functions.
+ */
+ WARN_ON(worker->task && worker->task != current);
worker->task = current;
+
+ if (worker->flags & KTW_FREEZABLE)
+ set_freezable();
+
repeat:
set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
@@ -601,12 +636,124 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
-/* insert @work before @pos in @worker */
-static void insert_kthread_work(struct kthread_worker *worker,
- struct kthread_work *work,
- struct list_head *pos)
+static __printf(3, 0) struct kthread_worker *
+__kthread_create_worker(int cpu, unsigned int flags,
+ const char namefmt[], va_list args)
+{
+ struct kthread_worker *worker;
+ struct task_struct *task;
+ int node = -1;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (!worker)
+ return ERR_PTR(-ENOMEM);
+
+ kthread_init_worker(worker);
+
+ if (cpu >= 0)
+ node = cpu_to_node(cpu);
+
+ task = __kthread_create_on_node(kthread_worker_fn, worker,
+ node, namefmt, args);
+ if (IS_ERR(task))
+ goto fail_task;
+
+ if (cpu >= 0)
+ kthread_bind(task, cpu);
+
+ worker->flags = flags;
+ worker->task = task;
+ wake_up_process(task);
+ return worker;
+
+fail_task:
+ kfree(worker);
+ return ERR_CAST(task);
+}
+
+/**
+ * kthread_create_worker - create a kthread worker
+ * @flags: flags modifying the default behavior of the worker
+ * @namefmt: printf-style name for the kthread worker (task).
+ *
+ * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
+ * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
+ * when the worker was SIGKILLed.
+ */
+struct kthread_worker *
+kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+{
+ struct kthread_worker *worker;
+ va_list args;
+
+ va_start(args, namefmt);
+ worker = __kthread_create_worker(-1, flags, namefmt, args);
+ va_end(args);
+
+ return worker;
+}
+EXPORT_SYMBOL(kthread_create_worker);
+
+/**
+ * kthread_create_worker_on_cpu - create a kthread worker and bind it
+ * to a given CPU and the associated NUMA node.
+ * @cpu: CPU number
+ * @flags: flags modifying the default behavior of the worker
+ * @namefmt: printf-style name for the kthread worker (task).
+ *
+ * Use a valid CPU number if you want to bind the kthread worker
+ * to the given CPU and the associated NUMA node.
+ *
+ * A good practice is to add the cpu number also into the worker name.
+ * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu).
+ *
+ * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
+ * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
+ * when the worker was SIGKILLed.
+ */
+struct kthread_worker *
+kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+ const char namefmt[], ...)
+{
+ struct kthread_worker *worker;
+ va_list args;
+
+ va_start(args, namefmt);
+ worker = __kthread_create_worker(cpu, flags, namefmt, args);
+ va_end(args);
+
+ return worker;
+}
+EXPORT_SYMBOL(kthread_create_worker_on_cpu);
+
+/*
+ * Returns true when the work could not be queued at the moment.
+ * It happens when it is already pending in a worker list
+ * or when it is being cancelled.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ lockdep_assert_held(&worker->lock);
+
+ return !list_empty(&work->node) || work->canceling;
+}
+
+static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
+ struct kthread_work *work)
{
lockdep_assert_held(&worker->lock);
+ WARN_ON_ONCE(!list_empty(&work->node));
+ /* Do not use a work with >1 worker, see kthread_queue_work() */
+ WARN_ON_ONCE(work->worker && work->worker != worker);
+}
+
+/* insert @work before @pos in @worker */
+static void kthread_insert_work(struct kthread_worker *worker,
+ struct kthread_work *work,
+ struct list_head *pos)
+{
+ kthread_insert_work_sanity_check(worker, work);
list_add_tail(&work->node, pos);
work->worker = worker;
@@ -615,29 +762,132 @@ static void insert_kthread_work(struct kthread_worker *worker,
}
/**
- * queue_kthread_work - queue a kthread_work
+ * kthread_queue_work - queue a kthread_work
* @worker: target kthread_worker
* @work: kthread_work to queue
*
* Queue @work to work processor @task for async execution. @task
* must have been created with kthread_worker_create(). Returns %true
* if @work was successfully queued, %false if it was already pending.
+ *
+ * Reinitialize the work if it needs to be used by another worker.
+ * For example, when the worker was stopped and started again.
*/
-bool queue_kthread_work(struct kthread_worker *worker,
+bool kthread_queue_work(struct kthread_worker *worker,
struct kthread_work *work)
{
bool ret = false;
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
- insert_kthread_work(worker, work, &worker->work_list);
+ if (!queuing_blocked(worker, work)) {
+ kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_kthread_work);
+EXPORT_SYMBOL_GPL(kthread_queue_work);
+
+/**
+ * kthread_delayed_work_timer_fn - callback that queues the associated kthread
+ * delayed work when the timer expires.
+ * @__data: pointer to the data associated with the timer
+ *
+ * The format of the function is defined by struct timer_list.
+ * It should be called from an irqsafe timer with irqs already off.
+ */
+void kthread_delayed_work_timer_fn(unsigned long __data)
+{
+ struct kthread_delayed_work *dwork =
+ (struct kthread_delayed_work *)__data;
+ struct kthread_work *work = &dwork->work;
+ struct kthread_worker *worker = work->worker;
+
+ /*
+ * This might happen when a pending work is reinitialized.
+ * It means that it is used in a wrong way.
+ */
+ if (WARN_ON_ONCE(!worker))
+ return;
+
+ spin_lock(&worker->lock);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ /* Move the work from worker->delayed_work_list. */
+ WARN_ON_ONCE(list_empty(&work->node));
+ list_del_init(&work->node);
+ kthread_insert_work(worker, work, &worker->work_list);
+
+ spin_unlock(&worker->lock);
+}
+EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
+
+void __kthread_queue_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct timer_list *timer = &dwork->timer;
+ struct kthread_work *work = &dwork->work;
+
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
+ timer->data != (unsigned long)dwork);
+
+ /*
+ * If @delay is 0, queue @dwork->work immediately. This is for
+ * both optimization and correctness. The earliest @timer can
+ * expire is on the closest next tick and delayed_work users depend
+ * on there being no such delay when @delay is 0.
+ */
+ if (!delay) {
+ kthread_insert_work(worker, work, &worker->work_list);
+ return;
+ }
+
+ /* Be paranoid and try to detect possible races already now. */
+ kthread_insert_work_sanity_check(worker, work);
+
+ list_add(&work->node, &worker->delayed_work_list);
+ work->worker = worker;
+ timer->expires = jiffies + delay;
+ add_timer(timer);
+}
+
+/**
+ * kthread_queue_delayed_work - queue the associated kthread work
+ * after a delay.
+ * @worker: target kthread_worker
+ * @dwork: kthread_delayed_work to queue
+ * @delay: number of jiffies to wait before queuing
+ *
+ * If the work has not been pending it starts a timer that will queue
+ * the work after the given @delay. If @delay is zero, it queues the
+ * work immediately.
+ *
+ * Return: %false if @dwork has already been pending. It means that
+ * either the timer was running or the work was queued. It returns %true
+ * otherwise.
+ */
+bool kthread_queue_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct kthread_work *work = &dwork->work;
+ unsigned long flags;
+ bool ret = false;
+
+ spin_lock_irqsave(&worker->lock, flags);
+
+ if (!queuing_blocked(worker, work)) {
+ __kthread_queue_delayed_work(worker, dwork, delay);
+ ret = true;
+ }
+
+ spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
struct kthread_flush_work {
struct kthread_work work;
@@ -652,12 +902,12 @@ static void kthread_flush_work_fn(struct kthread_work *work)
}
/**
- * flush_kthread_work - flush a kthread_work
+ * kthread_flush_work - flush a kthread_work
* @work: work to flush
*
* If @work is queued or executing, wait for it to finish execution.
*/
-void flush_kthread_work(struct kthread_work *work)
+void kthread_flush_work(struct kthread_work *work)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
@@ -666,21 +916,19 @@ void flush_kthread_work(struct kthread_work *work)
struct kthread_worker *worker;
bool noop = false;
-retry:
worker = work->worker;
if (!worker)
return;
spin_lock_irq(&worker->lock);
- if (work->worker != worker) {
- spin_unlock_irq(&worker->lock);
- goto retry;
- }
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
if (!list_empty(&work->node))
- insert_kthread_work(worker, &fwork.work, work->node.next);
+ kthread_insert_work(worker, &fwork.work, work->node.next);
else if (worker->current_work == work)
- insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+ kthread_insert_work(worker, &fwork.work,
+ worker->work_list.next);
else
noop = true;
@@ -689,23 +937,214 @@ retry:
if (!noop)
wait_for_completion(&fwork.done);
}
-EXPORT_SYMBOL_GPL(flush_kthread_work);
+EXPORT_SYMBOL_GPL(kthread_flush_work);
+
+/*
+ * This function removes the work from the worker queue. Also it makes sure
+ * that it won't get queued later via the delayed work's timer.
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work processed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
+ unsigned long *flags)
+{
+ /* Try to cancel the timer if exists. */
+ if (is_dwork) {
+ struct kthread_delayed_work *dwork =
+ container_of(work, struct kthread_delayed_work, work);
+ struct kthread_worker *worker = work->worker;
+
+ /*
+ * del_timer_sync() must be called to make sure that the timer
+ * callback is not running. The lock must be temporarily released
+ * to avoid a deadlock with the callback. In the meantime,
+ * any queuing is blocked by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, *flags);
+ del_timer_sync(&dwork->timer);
+ spin_lock_irqsave(&worker->lock, *flags);
+ work->canceling--;
+ }
+
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
/**
- * flush_kthread_worker - flush all current works on a kthread_worker
+ * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
+ * @worker: kthread worker to use
+ * @dwork: kthread delayed work to queue
+ * @delay: number of jiffies to wait before queuing
+ *
+ * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
+ * modify @dwork's timer so that it expires after @delay. If @delay is zero,
+ * @dwork is guaranteed to be queued immediately.
+ *
+ * Return: %true if @dwork was pending and its timer was modified,
+ * %false otherwise.
+ *
+ * A special case is when the work is being canceled in parallel.
+ * It might be caused either by the real kthread_cancel_delayed_work_sync()
+ * or yet another kthread_mod_delayed_work() call. We let the other command
+ * win and return %false here. The caller is supposed to synchronize these
+ * operations in a reasonable way.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
+ * for details.
+ */
+bool kthread_mod_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct kthread_work *work = &dwork->work;
+ unsigned long flags;
+ int ret = false;
+
+ spin_lock_irqsave(&worker->lock, flags);
+
+ /* Do not bother with canceling when never queued. */
+ if (!work->worker)
+ goto fast_queue;
+
+ /* Work must not be used with >1 worker, see kthread_queue_work() */
+ WARN_ON_ONCE(work->worker != worker);
+
+ /* Do not fight with another command that is canceling this work. */
+ if (work->canceling)
+ goto out;
+
+ ret = __kthread_cancel_work(work, true, &flags);
+fast_queue:
+ __kthread_queue_delayed_work(worker, dwork, delay);
+out:
+ spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
+
+static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ ret = __kthread_cancel_work(work, is_dwork, &flags);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, flags);
+ kthread_flush_work(work);
+ spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work, false);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
+/**
+ * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
+ * wait for it to finish.
+ * @dwork: the kthread delayed work to cancel
+ *
+ * This is kthread_cancel_work_sync() for delayed works.
+ *
+ * Return: %true if @dwork was pending, %false otherwise.
+ */
+bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
+{
+ return __kthread_cancel_work_sync(&dwork->work, true);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);
+
+/**
+ * kthread_flush_worker - flush all current works on a kthread_worker
* @worker: worker to flush
*
* Wait until all currently executing or pending works on @worker are
* finished.
*/
-void flush_kthread_worker(struct kthread_worker *worker)
+void kthread_flush_worker(struct kthread_worker *worker)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
- queue_kthread_work(worker, &fwork.work);
+ kthread_queue_work(worker, &fwork.work);
wait_for_completion(&fwork.done);
}
-EXPORT_SYMBOL_GPL(flush_kthread_worker);
+EXPORT_SYMBOL_GPL(kthread_flush_worker);
+
+/**
+ * kthread_destroy_worker - destroy a kthread worker
+ * @worker: worker to be destroyed
+ *
+ * Flush and destroy @worker. The simple flush is enough because the kthread
+ * worker API is used only in trivial scenarios. There are no multi-step state
+ * machines needed.
+ */
+void kthread_destroy_worker(struct kthread_worker *worker)
+{
+ struct task_struct *task;
+
+ task = worker->task;
+ if (WARN_ON(!task))
+ return;
+
+ kthread_flush_worker(worker);
+ kthread_stop(task);
+ WARN_ON(!list_empty(&worker->work_list));
+ kfree(worker);
+}
+EXPORT_SYMBOL(kthread_destroy_worker);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 8bbe50704621..af4643873e71 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod,
objname = klp_is_module(obj) ? obj->name : "vmlinux";
- module_disable_ro(pmod);
/* For each klp relocation section */
for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
sec = pmod->klp_info->sechdrs + i;
@@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod,
break;
}
- module_enable_ro(pmod, true);
return ret;
}
@@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
list_prev_entry(patch, list)->state == KLP_DISABLED)
return -EBUSY;
- pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
- add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
-
pr_notice("enabling patch '%s'\n", patch->mod->name);
klp_for_each_object(patch, obj) {
@@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->old_sympos ? func->old_sympos : 1);
}
+/* Arches may override this to finish any remaining arch-specific tasks */
+void __weak arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+}
+
/* parts of the initialization that is done only when the object is loaded */
static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
@@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
+ module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
- if (ret)
+ if (ret) {
+ module_enable_ro(patch->mod, true);
return ret;
+ }
+
+ arch_klp_init_object_loaded(patch, obj);
+ module_enable_ro(patch->mod, true);
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 31322a4275cd..760158d9d98d 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
@@ -29,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
+obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
deleted file mode 100644
index 951cfcd10b4a..000000000000
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
- LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
-{
- BUG_ON(cpu1 == cpu2);
-
- /* lock in cpu order, just like lg_global_lock */
- if (cpu2 < cpu1)
- swap(cpu1, cpu2);
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
-}
-
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
-{
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
- preempt_enable();
-}
-
-void lg_global_lock(struct lglock *lg)
-{
- int i;
-
- preempt_disable();
- lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
- }
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
- int i;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 589d763a49b3..9812e5dd409e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class)
name = class->name;
if (!name) {
name = __get_key_name(class->key, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
} else {
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
+ printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
- printk("/%d", class->subclass);
+ printk(KERN_CONT "/%d", class->subclass);
}
}
@@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class)
get_usage_chars(class, usage);
- printk(" (");
+ printk(KERN_CONT " (");
__print_lock_name(class);
- printk("){%s}", usage);
+ printk(KERN_CONT "){%s}", usage);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
if (!name)
name = __get_key_name(lock->key->subkeys, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
}
static void print_lock(struct held_lock *hlock)
@@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock)
barrier();
if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
- printk("<RELEASED>\n");
+ printk(KERN_CONT "<RELEASED>\n");
return;
}
print_lock_name(lock_classes + class_idx - 1);
- printk(", at: ");
- print_ip_sym(hlock->acquire_ip);
+ printk(KERN_CONT ", at: [<%p>] %pS\n",
+ (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
}
static void lockdep_print_held_locks(struct task_struct *curr)
@@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
if (!graph_lock()) {
@@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void)
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
- struct list_head *head, unsigned long ip,
- int distance, struct stack_trace *trace)
+static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+ unsigned long ip, int distance,
+ struct stack_trace *trace)
{
struct lock_list *entry;
/*
@@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
return 0;
printk("\n-> #%u", depth);
print_lock_name(target->class);
- printk(":\n");
+ printk(KERN_CONT ":\n");
print_stack_trace(&target->trace, 6);
return 0;
@@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src,
if (parent != source) {
printk("Chain exists of:\n ");
__print_lock_name(source);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(parent);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(target);
- printk("\n\n");
+ printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
@@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" ---- ----\n");
printk(" lock(");
__print_lock_name(target);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(parent);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(target);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(source);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth)
printk("%*s->", depth, "");
print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
+ printk(KERN_CONT " ops: %lu", class->ops);
+ printk(KERN_CONT " {\n");
for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
+ len += printk(KERN_CONT " at:\n");
print_stack_trace(class->usage_traces + bit, len);
}
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
+ printk("%*s ... key at: [<%p>] %pS\n",
+ depth, "", class->key, class->key);
}
/*
@@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
__print_lock_name(safe_class);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(middle_class);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(unsafe_class);
- printk("\n\n");
+ printk(KERN_CONT "\n\n");
}
printk(" Possible interrupt unsafe locking scenario:\n\n");
@@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk(" ---- ----\n");
printk(" lock(");
__print_lock_name(unsafe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
__print_lock_name(safe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(middle_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
__print_lock_name(safe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock(prev);
printk("which would create a new lock dependency:\n");
print_lock_name(hlock_class(prev));
- printk(" ->");
+ printk(KERN_CONT " ->");
print_lock_name(hlock_class(next));
- printk("\n");
+ printk(KERN_CONT "\n");
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
@@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
- printk("\nthe dependencies between %s-irq-safe lock", irqclass);
- printk(" and the holding lock:\n");
+ printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
return 0;
print_shortest_lock_dependencies(backwards_entry, prev_root);
@@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" ----\n");
printk(" lock(");
__print_lock_name(prev);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(next);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
}
@@ -1869,14 +1868,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ ret = add_lock_to_list(hlock_class(next),
&hlock_class(prev)->locks_after,
next->acquire_ip, distance, &trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ ret = add_lock_to_list(hlock_class(prev),
&hlock_class(next)->locks_before,
next->acquire_ip, distance, &trace);
if (!ret)
@@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
- printk(" => ");
+ printk(KERN_CONT " => ");
print_lock_name(hlock_class(next));
- printk("\n");
+ printk(KERN_CONT "\n");
dump_stack();
return graph_lock();
}
@@ -2204,7 +2203,7 @@ cache_hit:
* Important for check_no_collision().
*/
if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
- if (debug_locks_off_graph_unlock())
+ if (!debug_locks_off_graph_unlock())
return 0;
print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
@@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock)
printk(" ----\n");
printk(" lock(");
__print_lock_name(class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
__print_lock_name(class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
void print_irqtrace_events(struct task_struct *curr)
{
printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
- print_ip_sym(curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
- print_ip_sym(curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
- print_ip_sym(curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
- print_ip_sym(curr->softirq_disable_ip);
+ printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
+ curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
+ (void *)curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+ curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
+ (void *)curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): [<%p>] %pS\n",
+ curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
+ (void *)curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+ curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
+ (void *)curr->softirq_disable_ip);
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -3188,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
}
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (very_verbose(class)) {
printk("\nacquire class [%p] %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
}
@@ -3329,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock))
+ if (nest_lock && !__lock_is_held(nest_lock, -1))
return print_lock_nested_lock_not_held(curr, hlock, ip);
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
+ printk(KERN_CONT ") at:\n");
print_ip_sym(ip);
printk("but there are no more locks to release!\n");
printk("\nother info that might help us debug this:\n");
@@ -3576,7 +3579,7 @@ found_it:
return 1;
}
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3584,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock)
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (read == -1 || hlock->read == read)
+ return 1;
+
+ return 0;
+ }
}
return 0;
@@ -3769,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held(struct lockdep_map *lock)
+int lock_is_held_type(struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
@@ -3781,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock)
check_flags(flags);
current->lockdep_recursion = 1;
- ret = __lock_is_held(lock);
+ ret = __lock_is_held(lock, read);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
@@ -3871,7 +3878,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
+ printk(KERN_CONT ") at:\n");
print_ip_sym(ip);
printk("but there are no locks held!\n");
printk("\nother info that might help us debug this:\n");
@@ -4405,13 +4412,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
printk("\n");
- printk("===============================\n");
- printk("[ INFO: suspicious RCU usage. ]\n");
+ pr_err("===============================\n");
+ pr_err("[ ERR: suspicious RCU usage. ]\n");
print_kernel_ident();
- printk("-------------------------------\n");
- printk("%s:%d %s!\n", file, line, s);
- printk("\nother info that might help us debug this:\n\n");
- printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_err("-------------------------------\n");
+ pr_err("%s:%d %s!\n", file, line, s);
+ pr_err("\nother info that might help us debug this:\n\n");
+ pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: !rcu_is_watching()
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 51c4b24b6328..c2b88490d857 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,6 +46,14 @@ enum {
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
/*
+ * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * .data and .bss to fit in required 32MB limit for the kernel. With
+ * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * So, reduce the static allocations for lockdeps related structures so that
+ * everything fits in current required size limit.
+ */
+#ifdef CONFIG_PROVE_LOCKING_SMALL
+/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
*
@@ -54,18 +62,24 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
+#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+#else
#define MAX_LOCKDEP_ENTRIES 32768UL
#define MAX_LOCKDEP_CHAINS_BITS 16
-#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
-
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 524288UL
+#endif
+
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index a0f61effad25..6d1fcc786081 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -18,7 +18,7 @@
#include <linux/debug_locks.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include "lockdep_internals.h"
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f8c5af52a131..28350dc8ecbb 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -372,6 +372,78 @@ static struct lock_torture_ops mutex_lock_ops = {
.name = "mutex_lock"
};
+#include <linux/ww_mutex.h>
+static DEFINE_WW_CLASS(torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
+static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
+
+static int torture_ww_mutex_lock(void)
+__acquires(torture_ww_mutex_0)
+__acquires(torture_ww_mutex_1)
+__acquires(torture_ww_mutex_2)
+{
+ LIST_HEAD(list);
+ struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+ } locks[3], *ll, *ln;
+ struct ww_acquire_ctx ctx;
+
+ locks[0].lock = &torture_ww_mutex_0;
+ list_add(&locks[0].link, &list);
+
+ locks[1].lock = &torture_ww_mutex_1;
+ list_add(&locks[1].link, &list);
+
+ locks[2].lock = &torture_ww_mutex_2;
+ list_add(&locks[2].link, &list);
+
+ ww_acquire_init(&ctx, &torture_ww_class);
+
+ list_for_each_entry(ll, &list, link) {
+ int err;
+
+ err = ww_mutex_lock(ll->lock, &ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &list, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK)
+ return err;
+
+ ww_mutex_lock_slow(ll->lock, &ctx);
+ list_move(&ll->link, &list);
+ }
+
+ ww_acquire_fini(&ctx);
+ return 0;
+}
+
+static void torture_ww_mutex_unlock(void)
+__releases(torture_ww_mutex_0)
+__releases(torture_ww_mutex_1)
+__releases(torture_ww_mutex_2)
+{
+ ww_mutex_unlock(&torture_ww_mutex_0);
+ ww_mutex_unlock(&torture_ww_mutex_1);
+ ww_mutex_unlock(&torture_ww_mutex_2);
+}
+
+static struct lock_torture_ops ww_mutex_lock_ops = {
+ .writelock = torture_ww_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_ww_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "ww_mutex_lock"
+};
+
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
@@ -780,6 +852,10 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
+
end:
torture_cleanup_end();
}
@@ -793,6 +869,7 @@ static int __init lock_torture_init(void)
&spin_lock_ops, &spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
+ &ww_mutex_lock_ops,
#ifdef CONFIG_RT_MUTEXES
&rtmutex_lock_ops,
#endif
@@ -924,6 +1001,8 @@ static int __init lock_torture_init(void)
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c835270f0c2f..6a385aabcce7 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -28,7 +28,7 @@ struct mcs_spinlock {
#define arch_mcs_spin_lock_contended(l) \
do { \
while (!(smp_load_acquire(l))) \
- cpu_relax_lowlatency(); \
+ cpu_relax(); \
} while (0)
#endif
@@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
return;
/* Wait until the next pointer is set */
while (!(next = READ_ONCE(node->next)))
- cpu_relax_lowlatency();
+ cpu_relax();
}
/* Pass lock to next waiter. */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 9c951fade415..9aa713629387 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock)
{
if (likely(debug_locks)) {
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-
- if (!lock->owner)
- DEBUG_LOCKS_WARN_ON(!lock->owner);
- else
- DEBUG_LOCKS_WARN_ON(lock->owner != current);
-
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
}
-
- /*
- * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
- * mutexes so that we can do it here after we've verified state.
- */
- mutex_clear_owner(lock);
- atomic_set(&lock->count, 1);
}
void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 57a871ae3c81..4174417d5309 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -26,30 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
-
-static inline void mutex_set_owner(struct mutex *lock)
-{
- WRITE_ONCE(lock->owner, current);
-}
-
-static inline void mutex_clear_owner(struct mutex *lock)
-{
- WRITE_ONCE(lock->owner, NULL);
-}
-
-#define spin_lock_mutex(lock, flags) \
- do { \
- struct mutex *l = container_of(lock, struct mutex, wait_lock); \
- \
- DEBUG_LOCKS_WARN_ON(in_interrupt()); \
- local_irq_save(flags); \
- arch_spin_lock(&(lock)->rlock.raw_lock);\
- DEBUG_LOCKS_WARN_ON(l->magic != l); \
- } while (0)
-
-#define spin_unlock_mutex(lock, flags) \
- do { \
- arch_spin_unlock(&(lock)->rlock.raw_lock); \
- local_irq_restore(flags); \
- preempt_check_resched(); \
- } while (0)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90db3909..ad2d9e22697b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -27,41 +27,180 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
-/*
- * In the DEBUG case we are using the "NULL fastpath" for mutexes,
- * which forces all calls into the slowpath:
- */
#ifdef CONFIG_DEBUG_MUTEXES
# include "mutex-debug.h"
-# include <asm-generic/mutex-null.h>
-/*
- * Must be 0 for the debug case so we do not do the unlock outside of the
- * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
- * case.
- */
-# undef __mutex_slowpath_needs_to_unlock
-# define __mutex_slowpath_needs_to_unlock() 0
#else
# include "mutex.h"
-# include <asm/mutex.h>
#endif
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
- atomic_set(&lock->count, 1);
+ atomic_long_set(&lock->owner, 0);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
- mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
debug_mutex_init(lock, name, key);
}
-
EXPORT_SYMBOL(__mutex_init);
+/*
+ * @owner: contains: 'struct task_struct *' to the current lock owner,
+ * NULL means not owned. Since task_struct pointers are aligned to
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
+ *
+ * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
+ */
+#define MUTEX_FLAG_WAITERS 0x01
+#define MUTEX_FLAG_HANDOFF 0x02
+#define MUTEX_FLAG_PICKUP 0x04
+
+#define MUTEX_FLAGS 0x07
+
+static inline struct task_struct *__owner_task(unsigned long owner)
+{
+ return (struct task_struct *)(owner & ~MUTEX_FLAGS);
+}
+
+static inline unsigned long __owner_flags(unsigned long owner)
+{
+ return owner & MUTEX_FLAGS;
+}
+
+/*
+ * Trylock variant that returns the owning task on failure.
+ */
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+{
+ unsigned long owner, curr = (unsigned long)current;
+
+ owner = atomic_long_read(&lock->owner);
+ for (;;) { /* must loop, can race against a flag */
+ unsigned long old, flags = __owner_flags(owner);
+ unsigned long task = owner & ~MUTEX_FLAGS;
+
+ if (task) {
+ if (likely(task != curr))
+ break;
+
+ if (likely(!(flags & MUTEX_FLAG_PICKUP)))
+ break;
+
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP);
+#endif
+ }
+
+ /*
+ * We set the HANDOFF bit, we must make sure it doesn't live
+ * past the point where we acquire it. This would be possible
+ * if we (accidentally) set the bit on an unlocked mutex.
+ */
+ flags &= ~MUTEX_FLAG_HANDOFF;
+
+ old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
+ if (old == owner)
+ return NULL;
+
+ owner = old;
+ }
+
+ return __owner_task(owner);
+}
+
+/*
+ * Actual trylock that will work on any unlocked state.
+ */
+static inline bool __mutex_trylock(struct mutex *lock)
+{
+ return !__mutex_trylock_or_owner(lock);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Lockdep annotations are contained to the slow paths for simplicity.
+ * There is nothing that would stop spreading the lockdep annotations outwards
+ * except more code.
+ */
+
+/*
+ * Optimistic trylock that only works in the uncontended case. Make sure to
+ * follow with a __mutex_trylock() before failing.
+ */
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+
+ if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+ return true;
+
+ return false;
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+
+ if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr)
+ return true;
+
+ return false;
+}
+#endif
+
+static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_or(flag, &lock->owner);
+}
+
+static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_andnot(flag, &lock->owner);
+}
+
+static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
+}
+
+/*
+ * Give up ownership to a specific task, when @task = NULL, this is equivalent
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
+ * WAITERS. Provides RELEASE semantics like a regular unlock, the
+ * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
+ */
+static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ for (;;) {
+ unsigned long old, new;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+ DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+#endif
+
+ new = (owner & MUTEX_FLAG_WAITERS);
+ new |= (unsigned long)task;
+ if (task)
+ new |= MUTEX_FLAG_PICKUP;
+
+ old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
+ if (old == owner)
+ break;
+
+ owner = old;
+ }
+}
+
#ifndef CONFIG_DEBUG_LOCK_ALLOC
/*
* We split the mutex lock/unlock logic into separate fastpath and
@@ -69,7 +208,7 @@ EXPORT_SYMBOL(__mutex_init);
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
-__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
+static void __sched __mutex_lock_slowpath(struct mutex *lock);
/**
* mutex_lock - acquire the mutex
@@ -95,19 +234,15 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
- /*
- * The locking fastpath is the 1->0 transition from
- * 'unlocked' into 'locked' state.
- */
- __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
- mutex_set_owner(lock);
-}
+ if (!__mutex_trylock_fast(lock))
+ __mutex_lock_slowpath(lock);
+}
EXPORT_SYMBOL(mutex_lock);
#endif
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
- struct ww_acquire_ctx *ww_ctx)
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
#ifdef CONFIG_DEBUG_MUTEXES
/*
@@ -146,20 +281,50 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
ww_ctx->acquired++;
}
+static inline bool __sched
+__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+ return a->stamp - b->stamp <= LONG_MAX &&
+ (a->stamp != b->stamp || a > b);
+}
+
/*
- * After acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
+ * Wake up any waiters that may have to back off when the lock is held by the
+ * given context.
+ *
+ * Due to the invariants on the wait list, this can only affect the first
+ * waiter with a context.
*
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
+ * The current task must not be on the wait list.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
+static void __sched
+__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
{
- unsigned long flags;
struct mutex_waiter *cur;
+ lockdep_assert_held(&lock->wait_lock);
+
+ list_for_each_entry(cur, &lock->wait_list, list) {
+ if (!cur->ww_ctx)
+ continue;
+
+ if (cur->ww_ctx->acquired > 0 &&
+ __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) {
+ debug_mutex_wake_waiter(lock, cur);
+ wake_up_process(cur->task);
+ }
+
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
ww_mutex_lock_acquired(lock, ctx);
lock->ctx = ctx;
@@ -176,58 +341,91 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
/*
* Check if lock is contended, if not there is nobody to wake up
*/
- if (likely(atomic_read(&lock->base.count) == 0))
+ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
return;
/*
* Uh oh, we raced in fastpath, wake up everyone in this case,
* so they can see the new lock->ctx.
*/
- spin_lock_mutex(&lock->base.wait_lock, flags);
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
- spin_unlock_mutex(&lock->base.wait_lock, flags);
+ spin_lock(&lock->base.wait_lock);
+ __ww_mutex_wakeup_for_backoff(&lock->base, ctx);
+ spin_unlock(&lock->base.wait_lock);
}
/*
- * After acquiring lock in the slowpath set ctx and wake up any
- * waiters so they can recheck.
+ * After acquiring lock in the slowpath set ctx.
+ *
+ * Unlike for the fast path, the caller ensures that waiters are woken up where
+ * necessary.
*
* Callers must hold the mutex wait_lock.
*/
static __always_inline void
-ww_mutex_set_context_slowpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
+ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- struct mutex_waiter *cur;
-
ww_mutex_lock_acquired(lock, ctx);
lock->ctx = ctx;
+}
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline
+bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
+{
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ *
+ * Check this in every inner iteration because we may
+ * be racing against another thread's ww_mutex_lock.
+ */
+ if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx))
+ return false;
/*
- * Give any possible sleeping processes the chance to wake up,
- * so they can recheck if they have to back off.
+ * If we aren't on the wait list yet, cancel the spin
+ * if there are waiters. We want to avoid stealing the
+ * lock from a waiter with an earlier stamp, since the
+ * other thread may already own a lock that we also
+ * need.
*/
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
+ if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS))
+ return false;
+
+ /*
+ * Similarly, stop spinning if we are no longer the
+ * first waiter.
+ */
+ if (waiter && !__mutex_waiter_is_first(lock, waiter))
+ return false;
+
+ return true;
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
+ * Look out! "owner" is an entirely speculative pointer access and not
+ * reliable.
+ *
+ * "noinline" so that this function shows up on perf profiles.
*/
static noinline
-bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
+ struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter)
{
bool ret = true;
rcu_read_lock();
- while (lock->owner == owner) {
+ while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking lock->owner still matches owner. If that fails,
@@ -236,12 +434,21 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
*/
barrier();
- if (!owner->on_cpu || need_resched()) {
+ /*
+ * Use vcpu_is_preempted to detect lock holder preemption issue.
+ */
+ if (!owner->on_cpu || need_resched() ||
+ vcpu_is_preempted(task_cpu(owner))) {
+ ret = false;
+ break;
+ }
+
+ if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) {
ret = false;
break;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
rcu_read_unlock();
@@ -260,27 +467,25 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
return 0;
rcu_read_lock();
- owner = READ_ONCE(lock->owner);
+ owner = __mutex_owner(lock);
+
+ /*
+ * To avoid lock holder preemption issues, skip spinning if the task is
+ * not on a cpu or its cpu is preempted.
+ */
if (owner)
- retval = owner->on_cpu;
+ retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
+
/*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
+ * If lock->owner is not set, the mutex has been released. Return true
+ * such that we'll trylock in the spin path, which is a faster option
+ * than the blocking slow path.
*/
return retval;
}
/*
- * Atomically try to take the lock when it is available
- */
-static inline bool mutex_try_to_acquire(struct mutex *lock)
-{
- return !mutex_is_locked(lock) &&
- (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
-}
-
-/*
* Optimistic spinning.
*
* We try to spin for acquisition when we find that the lock owner
@@ -288,13 +493,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock)
* need to reschedule. The rationale is that if the lock owner is
* running, it is likely to release the lock soon.
*
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- *
* The mutex spinners are queued up using MCS lock so that only one
* spinner can compete for the mutex. However, if mutex spinning isn't
* going to happen, there is no point in going through the lock/unlock
@@ -302,74 +500,50 @@ static inline bool mutex_try_to_acquire(struct mutex *lock)
*
* Returns true when the lock was taken, otherwise false, indicating
* that we need to jump to the slowpath and sleep.
+ *
+ * The waiter flag is set to true if the spinner is a waiter in the wait
+ * queue. The waiter-spinner will spin on the lock directly and concurrently
+ * with the spinner at the head of the OSQ, if present, until the owner is
+ * changed to itself.
*/
-static bool mutex_optimistic_spin(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ const bool use_ww_ctx, struct mutex_waiter *waiter)
{
- struct task_struct *task = current;
-
- if (!mutex_can_spin_on_owner(lock))
- goto done;
-
- /*
- * In order to avoid a stampede of mutex spinners trying to
- * acquire the mutex all at once, the spinners need to take a
- * MCS (queued) lock first before spinning on the owner field.
- */
- if (!osq_lock(&lock->osq))
- goto done;
-
- while (true) {
- struct task_struct *owner;
-
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- /*
- * If ww->ctx is set the contents are undefined, only
- * by acquiring wait_lock there is a guarantee that
- * they are not invalid when reading.
- *
- * As such, when deadlock detection needs to be
- * performed the optimistic spinning cannot be done.
- */
- if (READ_ONCE(ww->ctx))
- break;
- }
-
+ if (!waiter) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * The purpose of the mutex_can_spin_on_owner() function is
+ * to eliminate the overhead of osq_lock() and osq_unlock()
+ * in case spinning isn't possible. As a waiter-spinner
+ * is not going to take OSQ lock anyway, there is no need
+ * to call mutex_can_spin_on_owner().
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
+ if (!mutex_can_spin_on_owner(lock))
+ goto fail;
- /* Try to acquire the mutex if it is unlocked. */
- if (mutex_try_to_acquire(lock)) {
- lock_acquired(&lock->dep_map, ip);
-
- if (use_ww_ctx) {
- struct ww_mutex *ww;
- ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
+ if (!osq_lock(&lock->osq))
+ goto fail;
+ }
- ww_mutex_set_context_fastpath(ww, ww_ctx);
- }
+ for (;;) {
+ struct task_struct *owner;
- mutex_set_owner(lock);
- osq_unlock(&lock->osq);
- return true;
- }
+ /* Try to acquire the mutex... */
+ owner = __mutex_trylock_or_owner(lock);
+ if (!owner)
+ break;
/*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
+ * There's an owner, wait for it to either
+ * release the lock or go to sleep.
*/
- if (!owner && (need_resched() || rt_task(task)))
- break;
+ if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter))
+ goto fail_unlock;
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -377,11 +551,20 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax_lowlatency();
+ cpu_relax();
}
- osq_unlock(&lock->osq);
-done:
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+ return true;
+
+
+fail_unlock:
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+fail:
/*
* If we fell out of the spin path because of need_resched(),
* reschedule now, before we try-lock the mutex. This avoids getting
@@ -399,15 +582,15 @@ done:
return false;
}
#else
-static bool mutex_optimistic_spin(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ const bool use_ww_ctx, struct mutex_waiter *waiter)
{
return false;
}
#endif
-__visible __used noinline
-void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
/**
* mutex_unlock - release the mutex
@@ -422,21 +605,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
*/
void __sched mutex_unlock(struct mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
-#ifndef CONFIG_DEBUG_MUTEXES
- /*
- * When debugging is enabled we must not clear the owner before time,
- * the slow path will always be taken, and that clears the owner field
- * after verifying that it was indeed current.
- */
- mutex_clear_owner(lock);
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+ if (__mutex_unlock_fast(lock))
+ return;
#endif
- __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+ __mutex_unlock_slowpath(lock, _RET_IP_);
}
-
EXPORT_SYMBOL(mutex_unlock);
/**
@@ -465,36 +639,93 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
lock->ctx = NULL;
}
-#ifndef CONFIG_DEBUG_MUTEXES
- /*
- * When debugging is enabled we must not clear the owner before time,
- * the slow path will always be taken, and that clears the owner field
- * after verifying that it was indeed current.
- */
- mutex_clear_owner(&lock->base);
-#endif
- __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+ mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
static inline int __sched
-__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter,
+ struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct mutex_waiter *cur;
+
+ if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
+ goto deadlock;
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must back off.
+ */
+ cur = waiter;
+ list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
+ if (cur->ww_ctx)
+ goto deadlock;
+ }
- if (!hold_ctx)
+ return 0;
+
+deadlock:
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+ ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+}
+
+static inline int __sched
+__ww_mutex_add_waiter(struct mutex_waiter *waiter,
+ struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct mutex_waiter *cur;
+ struct list_head *pos;
+
+ if (!ww_ctx) {
+ list_add_tail(&waiter->list, &lock->wait_list);
return 0;
+ }
- if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
- (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them.
+ */
+ pos = &lock->wait_list;
+ list_for_each_entry_reverse(cur, &lock->wait_list, list) {
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
+ /* Back off immediately if necessary. */
+ if (ww_ctx->acquired > 0) {
#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
- ctx->contending_lock = ww;
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
#endif
- return -EDEADLK;
+ return -EDEADLK;
+ }
+
+ break;
+ }
+
+ pos = &cur->list;
+
+ /*
+ * Wake up the waiter so that it gets a chance to back
+ * off.
+ */
+ if (cur->ww_ctx->acquired > 0) {
+ debug_mutex_wake_waiter(lock, cur);
+ wake_up_process(cur->task);
+ }
}
+ list_add_tail(&waiter->list, pos);
return 0;
}
@@ -506,13 +737,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
- struct task_struct *task = current;
struct mutex_waiter waiter;
- unsigned long flags;
+ bool first = false;
+ struct ww_mutex *ww;
int ret;
- if (use_ww_ctx) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ might_sleep();
+
+ ww = container_of(lock, struct ww_mutex, base);
+ if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
}
@@ -520,106 +753,157 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
- if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ if (__mutex_trylock(lock) ||
+ mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
/* got the lock, yay! */
+ lock_acquired(&lock->dep_map, ip);
+ if (use_ww_ctx && ww_ctx)
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
}
- spin_lock_mutex(&lock->wait_lock, flags);
-
+ spin_lock(&lock->wait_lock);
/*
- * Once more, try to acquire the lock. Only try-lock the mutex if
- * it is unlocked to reduce unnecessary xchg() operations.
+ * After waiting to acquire the wait_lock, try again.
*/
- if (!mutex_is_locked(lock) &&
- (atomic_xchg_acquire(&lock->count, 0) == 1))
+ if (__mutex_trylock(lock)) {
+ if (use_ww_ctx && ww_ctx)
+ __ww_mutex_wakeup_for_backoff(lock, ww_ctx);
+
goto skip_wait;
+ }
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task);
-
- /* add waiting tasks to the end of the waitqueue (FIFO): */
- list_add_tail(&waiter.list, &lock->wait_list);
- waiter.task = task;
+ debug_mutex_add_waiter(lock, &waiter, current);
lock_contended(&lock->dep_map, ip);
+ if (!use_ww_ctx) {
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ list_add_tail(&waiter.list, &lock->wait_list);
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ waiter.ww_ctx = MUTEX_POISON_WW_CTX;
+#endif
+ } else {
+ /* Add in stamp order, waking up waiters that must back off. */
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
+ if (ret)
+ goto err_early_backoff;
+
+ waiter.ww_ctx = ww_ctx;
+ }
+
+ waiter.task = current;
+
+ if (__mutex_waiter_is_first(lock, &waiter))
+ __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+
+ set_current_state(state);
for (;;) {
/*
- * Lets try to take the lock again - this is needed even if
- * we get here for the first time (shortly after failing to
- * acquire the lock), to make sure that we get a wakeup once
- * it's unlocked. Later on, if we sleep, this is the
- * operation that gives us the lock. We xchg it to -1, so
- * that when we release the lock, we properly wake up the
- * other waiters. We only attempt the xchg if the count is
- * non-negative in order to avoid unnecessary xchg operations:
+ * Once we hold wait_lock, we're serialized against
+ * mutex_unlock() handing the lock off to us, do a trylock
+ * before testing the error conditions to make sure we pick up
+ * the handoff.
*/
- if (atomic_read(&lock->count) >= 0 &&
- (atomic_xchg_acquire(&lock->count, -1) == 1))
- break;
+ if (__mutex_trylock(lock))
+ goto acquired;
/*
- * got a signal? (This code gets eliminated in the
- * TASK_UNINTERRUPTIBLE case.)
+ * Check for signals and wound conditions while holding
+ * wait_lock. This ensures the lock cancellation is ordered
+ * against mutex_unlock() and wake-ups do not go missing.
*/
- if (unlikely(signal_pending_state(state, task))) {
+ if (unlikely(signal_pending_state(state, current))) {
ret = -EINTR;
goto err;
}
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
+ if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
+ ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
- __set_task_state(task, state);
-
- /* didn't get the lock, go to sleep: */
- spin_unlock_mutex(&lock->wait_lock, flags);
+ spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
- spin_lock_mutex(&lock->wait_lock, flags);
+
+ /*
+ * ww_mutex needs to always recheck its position since its waiter
+ * list is not FIFO ordered.
+ */
+ if ((use_ww_ctx && ww_ctx) || !first) {
+ first = __mutex_waiter_is_first(lock, &waiter);
+ if (first)
+ __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+ }
+
+ set_current_state(state);
+ /*
+ * Here we order against unlock; we must either see it change
+ * state back to RUNNING and fall through the next schedule(),
+ * or we must see its unlock and acquire.
+ */
+ if (__mutex_trylock(lock) ||
+ (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ break;
+
+ spin_lock(&lock->wait_lock);
}
- __set_task_state(task, TASK_RUNNING);
+ spin_lock(&lock->wait_lock);
+acquired:
+ __set_current_state(TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, task);
- /* set it to 0 if there are no waiters left: */
+ mutex_remove_waiter(lock, &waiter, current);
if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
debug_mutex_free_waiter(&waiter);
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- mutex_set_owner(lock);
- if (use_ww_ctx) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ if (use_ww_ctx && ww_ctx)
ww_mutex_set_context_slowpath(ww, ww_ctx);
- }
- spin_unlock_mutex(&lock->wait_lock, flags);
+ spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
err:
- mutex_remove_waiter(lock, &waiter, task);
- spin_unlock_mutex(&lock->wait_lock, flags);
+ __set_current_state(TASK_RUNNING);
+ mutex_remove_waiter(lock, &waiter, current);
+err_early_backoff:
+ spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
preempt_enable();
return ret;
}
+static int __sched
+__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
+}
+
+static int __sched
+__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true);
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -627,32 +911,38 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested);
void __sched
_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
- 0, nest, _RET_IP_, NULL, 0);
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
}
-
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
- might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
- might_sleep();
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
-
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
static inline int
ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
@@ -680,89 +970,102 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
- ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, &ctx->dep_map, _RET_IP_, ctx, 1);
- if (!ret && ctx->acquired > 1)
+ ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
+ 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+ ctx);
+ if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
-EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
- ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
- 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+ ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
+ 0, ctx ? &ctx->dep_map : NULL, _RET_IP_,
+ ctx);
- if (!ret && ctx->acquired > 1)
+ if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
-EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
#endif
/*
* Release the lock, slowpath:
*/
-static inline void
-__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
{
- unsigned long flags;
- WAKE_Q(wake_q);
+ struct task_struct *next = NULL;
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long owner;
+
+ mutex_release(&lock->dep_map, 1, ip);
/*
- * As a performance measurement, release the lock before doing other
- * wakeup related duties to follow. This allows other tasks to acquire
- * the lock sooner, while still handling cleanups in past unlock calls.
- * This can be done as we do not enforce strict equivalence between the
- * mutex counter and wait_list.
- *
+ * Release the lock before (potentially) taking the spinlock such that
+ * other contenders can get on with things ASAP.
*
- * Some architectures leave the lock unlocked in the fastpath failure
- * case, others need to leave it locked. In the later case we have to
- * unlock it here - as the lock counter is currently 0 or negative.
+ * Except when HANDOFF, in that case we must not clear the owner field,
+ * but instead set it to the top waiter.
*/
- if (__mutex_slowpath_needs_to_unlock())
- atomic_set(&lock->count, 1);
+ owner = atomic_long_read(&lock->owner);
+ for (;;) {
+ unsigned long old;
- spin_lock_mutex(&lock->wait_lock, flags);
- mutex_release(&lock->dep_map, nested, _RET_IP_);
- debug_mutex_unlock(lock);
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+ DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+#endif
+ if (owner & MUTEX_FLAG_HANDOFF)
+ break;
+
+ old = atomic_long_cmpxchg_release(&lock->owner, owner,
+ __owner_flags(owner));
+ if (old == owner) {
+ if (owner & MUTEX_FLAG_WAITERS)
+ break;
+
+ return;
+ }
+
+ owner = old;
+ }
+
+ spin_lock(&lock->wait_lock);
+ debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
- list_entry(lock->wait_list.next,
- struct mutex_waiter, list);
+ list_first_entry(&lock->wait_list,
+ struct mutex_waiter, list);
+
+ next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
- wake_q_add(&wake_q, waiter->task);
+ wake_q_add(&wake_q, next);
}
- spin_unlock_mutex(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
-}
+ if (owner & MUTEX_FLAG_HANDOFF)
+ __mutex_handoff(lock, next);
-/*
- * Release the lock, slowpath:
- */
-__visible void
-__mutex_unlock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
+ spin_unlock(&lock->wait_lock);
- __mutex_unlock_common_slowpath(lock, 1);
+ wake_up_q(&wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -789,104 +1092,72 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock);
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->count);
- if (likely(!ret)) {
- mutex_set_owner(lock);
+
+ if (__mutex_trylock_fast(lock))
return 0;
- } else
- return __mutex_lock_interruptible_slowpath(lock);
+
+ return __mutex_lock_interruptible_slowpath(lock);
}
EXPORT_SYMBOL(mutex_lock_interruptible);
int __sched mutex_lock_killable(struct mutex *lock)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->count);
- if (likely(!ret)) {
- mutex_set_owner(lock);
+
+ if (__mutex_trylock_fast(lock))
return 0;
- } else
- return __mutex_lock_killable_slowpath(lock);
+
+ return __mutex_lock_killable_slowpath(lock);
}
EXPORT_SYMBOL(mutex_lock_killable);
-__visible void __sched
-__mutex_lock_slowpath(atomic_t *lock_count)
+void __sched mutex_lock_io(struct mutex *lock)
{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
+ int token;
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
- NULL, _RET_IP_, NULL, 0);
+ token = io_schedule_prepare();
+ mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
+
+static noinline void __sched
+__mutex_lock_slowpath(struct mutex *lock)
+{
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_killable_slowpath(struct mutex *lock)
{
- return __mutex_lock_common(lock, TASK_KILLABLE, 0,
- NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock)
{
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
- NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
- NULL, _RET_IP_, ctx, 1);
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL,
+ _RET_IP_, ctx);
}
static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
- NULL, _RET_IP_, ctx, 1);
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL,
+ _RET_IP_, ctx);
}
#endif
-/*
- * Spinlock based trylock, we take the spinlock and check whether we
- * can get the lock:
- */
-static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
- unsigned long flags;
- int prev;
-
- /* No need to trylock if the mutex is locked. */
- if (mutex_is_locked(lock))
- return 0;
-
- spin_lock_mutex(&lock->wait_lock, flags);
-
- prev = atomic_xchg_acquire(&lock->count, -1);
- if (likely(prev == 1)) {
- mutex_set_owner(lock);
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- }
-
- /* Set it back to 0 if there are no waiters: */
- if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
-
- spin_unlock_mutex(&lock->wait_lock, flags);
-
- return prev == 1;
-}
-
/**
* mutex_trylock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
@@ -903,52 +1174,45 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
*/
int __sched mutex_trylock(struct mutex *lock)
{
- int ret;
+ bool locked = __mutex_trylock(lock);
- ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
- if (ret)
- mutex_set_owner(lock);
+ if (locked)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- return ret;
+ return locked;
}
EXPORT_SYMBOL(mutex_trylock);
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->base.count);
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
- if (likely(!ret)) {
- ww_mutex_set_context_fastpath(lock, ctx);
- mutex_set_owner(&lock->base);
- } else
- ret = __ww_mutex_lock_slowpath(lock, ctx);
- return ret;
+ return __ww_mutex_lock_slowpath(lock, ctx);
}
-EXPORT_SYMBOL(__ww_mutex_lock);
+EXPORT_SYMBOL(ww_mutex_lock);
int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->base.count);
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
- if (likely(!ret)) {
- ww_mutex_set_context_fastpath(lock, ctx);
- mutex_set_owner(&lock->base);
- } else
- ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
- return ret;
+ return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
}
-EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
#endif
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 6cd6b8e9efd7..6ebc1902f779 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -9,39 +9,9 @@
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define spin_lock_mutex(lock, flags) \
- do { spin_lock(lock); (void)(flags); } while (0)
-#define spin_unlock_mutex(lock, flags) \
- do { spin_unlock(lock); (void)(flags); } while (0)
#define mutex_remove_waiter(lock, waiter, task) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * The mutex owner can get read and written to locklessly.
- * We should use WRITE_ONCE when writing the owner value to
- * avoid store tearing, otherwise, a thread could potentially
- * read a partially written and incomplete owner value.
- */
-static inline void mutex_set_owner(struct mutex *lock)
-{
- WRITE_ONCE(lock->owner, current);
-}
-
-static inline void mutex_clear_owner(struct mutex *lock)
-{
- WRITE_ONCE(lock->owner, NULL);
-}
-#else
-static inline void mutex_set_owner(struct mutex *lock)
-{
-}
-
-static inline void mutex_clear_owner(struct mutex *lock)
-{
-}
-#endif
-
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a37857ab55..a3167941093b 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
}
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+ return node->cpu - 1;
+}
+
static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
int cpu_nr = encoded_cpu_val - 1;
@@ -75,7 +80,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
break;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
return next;
@@ -118,11 +123,13 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
+ * Use vcpu_is_preempted() to avoid waiting for a preempted
+ * lock holder:
*/
- if (need_resched())
+ if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
- cpu_relax_lowlatency();
+ cpu_relax();
}
return true;
@@ -148,7 +155,7 @@ unqueue:
if (smp_load_acquire(&node->locked))
return true;
- cpu_relax_lowlatency();
+ cpu_relax();
/*
* Or we race against a concurrent unqueue()'s step-B, in which
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index bec0b647f9cc..883cf1b92d90 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,159 +1,192 @@
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
-#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/errno.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ rcuwait_init(&sem->writer);
+ sem->readers_block = 0;
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
/*
* XXX: temporary kludge. The error path in alloc_super()
* assumes that percpu_free_rwsem() is safe after kzalloc().
*/
- if (!brw->fast_read_ctr)
+ if (!sem->read_count)
return;
- rcu_sync_dtor(&brw->rss);
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
{
- bool success;
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
+ */
- preempt_disable();
- success = rcu_sync_is_idle(&brw->rss);
- if (likely(success))
- __this_cpu_add(*brw->fast_read_ctr, val);
- preempt_enable();
+ smp_mb(); /* A matches D */
- return success;
-}
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
- might_sleep();
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
- if (likely(update_fast_ctr(brw, +1)))
- return;
+ if (try)
+ return 0;
- /* Avoid rwsem_acquire_read() and rwsem_release() */
- __down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
- if (unlikely(!update_fast_ctr(brw, +1))) {
- if (!__down_read_trylock(&brw->rw_sem))
- return 0;
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
- }
-
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ /*
+ * Avoid lockdep for the down/up_read(); we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ this_cpu_inc(*sem->read_count);
+ __up_read(&sem->rw_sem);
+
+ preempt_disable();
return 1;
}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, -1)))
- return;
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
+ /* Prod writer to recheck readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
}
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- unsigned int sum = 0;
- int cpu;
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+ smp_mb(); /* C matches B */
- return sum;
+ return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ down_write(&sem->rw_sem);
+
/*
- * Make rcu_sync_is_idle() == F and thus disable the fast-path in
- * percpu_down_read() and percpu_up_read(), and wait for gp pass.
- *
- * The latter synchronises us with the preceding readers which used
- * the fast-past, so we can not miss the result of __this_cpu_add()
- * or anything else inside their criticial sections.
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
*/
- rcu_sync_enter(&brw->rss);
+ WRITE_ONCE(sem->readers_block, 1);
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
+ smp_mb(); /* D matches A */
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+ /*
+ * If they don't see our write to readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
+ */
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+ /* Wait for all now active readers to complete. */
+ rcuwait_wait_event(&sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Enable the fast-path in percpu_down_read() and percpu_up_read()
- * but only after another gp pass; this adds the necessary barrier
- * to ensure the reader can't miss the changes done by us.
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
+ */
+ smp_store_release(&sem->readers_block, 0);
+
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
+
+ /*
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because it's counting.
*/
- rcu_sync_exit(&brw->rss);
+ rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 19248ddf37ce..cc3ed0ccdfa2 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -54,7 +54,7 @@ static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax_lowlatency();
+ cpu_relax();
cnts = atomic_read_acquire(&lock->cnts);
}
}
@@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
(cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
break;
- cpu_relax_lowlatency();
+ cpu_relax();
}
/* When no more readers, set the locked flag */
@@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
_QW_LOCKED) == _QW_WAITING))
break;
- cpu_relax_lowlatency();
+ cpu_relax();
}
unlock:
arch_spin_unlock(&lock->wait_lock);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8a99abf58080..e6b2f7ad3e51 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -70,11 +70,14 @@ struct pv_node {
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
- qstat_inc(qstat_pv_lock_stealing, ret);
- return ret;
+ if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+
+ return false;
}
/*
@@ -257,11 +260,10 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
-
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running;
+ return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}
/*
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
- int waitcnt = 0;
int loop;
bool wait_early;
- /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
- for (;; waitcnt++) {
+ for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
if (!READ_ONCE(node->locked)) {
qstat_inc(qstat_pv_wait_node, true);
- qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
}
@@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
- * The unlocker should have freed the lock before kicking the
- * CPU. So if the lock is still not free, it is a spurious
- * wakeup or another vCPU has stolen the lock. The current
- * vCPU should spin again.
+ * Because of lock stealing, the queue head vCPU may not be
+ * able to acquire the lock before it has to wait again.
*/
- qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
@@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index b9d031516254..e852be4851fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -24,8 +24,8 @@
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
* pv_lock_slowpath - # of locking operations via the slowpath
* pv_lock_stealing - # of lock stealing operations
- * pv_spurious_wakeup - # of spurious wakeups
- * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
+ * pv_wait_again - # of wait's after a queue head vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
@@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- if (!file->f_inode) {
- WARN_ON_ONCE(1);
- return -EBADF;
- }
- counter = (long)(file->f_inode->i_private);
+ counter = (long)file_inode(file)->i_private;
if (counter >= qstat_num)
return -EBADF;
@@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf,
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- if (!file->f_inode) {
- WARN_ON_ONCE(1);
- return -EBADF;
- }
- if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ if ((long)file_inode(file)->i_private != qstat_reset_cnts)
return count;
for_each_possible_cpu(cpu) {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1ec0f48962b3..d340be3a488f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
{
- if (!rt_mutex_has_waiters(lock))
- clear_rt_mutex_waiters(lock);
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ if (rt_mutex_has_waiters(lock))
+ return;
+
+ /*
+ * The rbtree has no waiters enqueued, now make sure that the
+ * lock->owner still has the waiters bit set, otherwise the
+ * following can happen:
+ *
+ * CPU 0 CPU 1 CPU2
+ * l->owner=T1
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T2)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ *
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T3)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ * signal(->T2) signal(->T3)
+ * lock(l->lock)
+ * dequeue(T2)
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * dequeue(T3)
+ * ==> wait list is empty
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l)) {
+ * l->owner = owner
+ * owner = l->owner & ~HAS_WAITERS;
+ * ==> l->owner = T1
+ * }
+ * lock(l->lock)
+ * rt_mutex_unlock(l) fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l)) {
+ * owner = l->owner & ~HAS_WAITERS;
+ * cmpxchg(l->owner, T1, NULL)
+ * ===> Success (l->owner = NULL)
+ *
+ * l->owner = owner
+ * ==> l->owner = T1
+ * }
+ *
+ * With the check for the waiter bit in place T3 on CPU2 will not
+ * overwrite. All tasks fiddling with the waiters bit are
+ * serialized by l->lock, so nothing else can modify the waiters
+ * bit. If the bit is set then nothing can change l->owner either
+ * so the simple RMW is safe. The cmpxchg() will simply fail if it
+ * happens in the middle of the RMW because the waiters bit is
+ * still set.
+ */
+ owner = READ_ONCE(*p);
+ if (owner & RT_MUTEX_HAS_WAITERS)
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
}
/*
@@ -1115,7 +1179,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
* TASK_INTERRUPTIBLE checks for signals and
* timeout. Ignored otherwise.
*/
- if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ if (likely(state == TASK_INTERRUPTIBLE)) {
/* Signal pending? */
if (signal_pending(current))
ret = -EINTR;
@@ -1382,7 +1446,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
struct wake_q_head *wqh))
{
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
@@ -1555,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
* proxy owner
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @proxy_owner:the task to set as owner
*
* No locking. Caller has to do serializing itself
- * Special API call for PI-futex support
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
*/
void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
@@ -1573,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
/**
* rt_mutex_proxy_unlock - release a lock on behalf of owner
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
*
* No locking. Caller has to do serializing itself
- * Special API call for PI-futex support
+ *
+ * Special API call for PI-futex support. This merrily cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is no longer visible to other tasks.
*/
void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c7d2d3..990134617b4c 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -71,12 +71,12 @@ task_top_pi_waiter(struct task_struct *p)
* lock->owner state tracking:
*/
#define RT_MUTEX_HAS_WAITERS 1UL
-#define RT_MUTEX_OWNER_MASKALL 1UL
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
{
- return (struct task_struct *)
- ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+ unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+ return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
}
/*
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 1591f6b3539f..5eacab880f67 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -128,7 +128,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
void __sched __down_read(struct rw_semaphore *sem)
{
struct rwsem_waiter waiter;
- struct task_struct *tsk;
unsigned long flags;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
@@ -140,13 +139,12 @@ void __sched __down_read(struct rw_semaphore *sem)
goto out;
}
- tsk = current;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
/* set up my own style of waitqueue */
- waiter.task = tsk;
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
+ get_task_struct(current);
list_add_tail(&waiter.list, &sem->wait_list);
@@ -158,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem)
if (!waiter.task)
break;
schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
}
- __set_task_state(tsk, TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
out:
;
}
@@ -194,15 +192,13 @@ int __down_read_trylock(struct rw_semaphore *sem)
int __sched __down_write_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
- struct task_struct *tsk;
unsigned long flags;
int ret = 0;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
/* set up my own style of waitqueue */
- tsk = current;
- waiter.task = tsk;
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
list_add_tail(&waiter.list, &sem->wait_list);
@@ -220,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
ret = -EINTR;
goto out;
}
- set_task_state(tsk, state);
+ set_current_state(state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
raw_spin_lock_irqsave(&sem->wait_lock, flags);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 447e08de1fab..2ad8d8dc3bb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -121,16 +121,19 @@ enum rwsem_wake_type {
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
*/
-static struct rw_semaphore *
-__rwsem_mark_wake(struct rw_semaphore *sem,
- enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
+static void __rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- struct list_head *next;
- long oldcount, woken, loop, adjustment;
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY) {
/*
@@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
*/
wake_q_add(wake_q, waiter->task);
}
- goto out;
+
+ return;
}
- /* Writers might steal the lock before we grant it to the next reader.
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
* We prefer to do the first reader grant before counting readers
* so we can bail out early if a writer stole the lock.
*/
- adjustment = 0;
if (wake_type != RWSEM_WAKE_READ_OWNED) {
adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
oldcount = atomic_long_fetch_add(adjustment, &sem->count);
-
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
/*
* If the count is still less than RWSEM_WAITING_BIAS
@@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
*/
if (atomic_long_add_return(-adjustment, &sem->count) <
RWSEM_WAITING_BIAS)
- goto out;
+ return;
+
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
@@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
rwsem_set_reader_owned(sem);
}
- /* Grant an infinite number of read locks to the readers at the front
- * of the queue. Note we increment the 'active part' of the count by
- * the number of readers before waking any processes up.
+ /*
+ * Grant an infinite number of read locks to the readers at the front
+ * of the queue. We know that woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
*/
- woken = 0;
- do {
- woken++;
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ struct task_struct *tsk;
- if (waiter->list.next == &sem->wait_list)
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
break;
- waiter = list_entry(waiter->list.next,
- struct rwsem_waiter, list);
-
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (waiter->type != RWSEM_WAITING_FOR_WRITE)
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
-
- if (adjustment)
- atomic_long_add(adjustment, &sem->count);
-
- next = sem->wait_list.next;
- loop = woken;
- do {
- waiter = list_entry(next, struct rwsem_waiter, list);
- next = waiter->list.next;
+ woken++;
tsk = waiter->task;
wake_q_add(wake_q, tsk);
+ list_del(&waiter->list);
/*
* Ensure that the last operation is setting the reader
* waiter to nil such that rwsem_down_read_failed() cannot
@@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
* to the task to wakeup.
*/
smp_store_release(&waiter->task, NULL);
- } while (--loop);
+ }
- sem->wait_list.next = next;
- next->prev = &sem->wait_list;
+ adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+ if (list_empty(&sem->wait_list)) {
+ /* hit end of list above */
+ adjustment -= RWSEM_WAITING_BIAS;
+ }
- out:
- return sem;
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
}
/*
@@ -232,11 +224,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
- /* set up my own style of waitqueue */
- waiter.task = tsk;
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
raw_spin_lock_irq(&sem->wait_lock);
@@ -247,7 +237,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
- /* If there are no active locks, wake the front queued process(es).
+ /*
+ * If there are no active locks, wake the front queued process(es).
*
* If there are no writers and we are first in the queue,
* wake our own waiter to join the existing active readers !
@@ -255,20 +246,20 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
if (count == RWSEM_WAITING_BIAS ||
(count > RWSEM_WAITING_BIAS &&
adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
if (!waiter.task)
break;
schedule();
}
- __set_task_state(tsk, TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
return sem;
}
EXPORT_SYMBOL(rwsem_down_read_failed);
@@ -344,7 +335,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
goto done;
}
- ret = owner->on_cpu;
+ /*
+ * Because of the lock holder preemption issue, skip spinning if the
+ * owner task is not on a cpu or its cpu is preempted.
+ */
+ ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
done:
rcu_read_unlock();
return ret;
@@ -370,13 +365,17 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
*/
barrier();
- /* abort spinning when need_resched or owner is not running */
- if (!owner->on_cpu || need_resched()) {
+ /*
+ * abort spinning when need_resched or owner is not running or
+ * owner's cpu is preempted.
+ */
+ if (!owner->on_cpu || need_resched() ||
+ vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
rcu_read_unlock();
out:
@@ -431,7 +430,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax_lowlatency();
+ cpu_relax();
}
osq_unlock(&sem->osq);
done:
@@ -469,7 +468,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
/* undo write bias from down_write operation, stop active locking */
count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
@@ -503,9 +502,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
* wake any read locks that were queued ahead of us.
*/
if (count > RWSEM_WAITING_BIAS) {
- WAKE_Q(wake_q);
-
- sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
/*
* The wakeup is normally called _after_ the wait_lock
* is released, but given that we are proactively waking
@@ -514,6 +511,11 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
* for attempting rwsem_try_write_lock().
*/
wake_up_q(&wake_q);
+
+ /*
+ * Reinitialize wake_q after use.
+ */
+ wake_q_init(&wake_q);
}
} else
@@ -579,7 +581,7 @@ __visible
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
/*
* If a spinner is present, it is not necessary to do the wakeup.
@@ -614,9 +616,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
locked:
- /* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
wake_up_q(&wake_q);
@@ -634,13 +635,12 @@ __visible
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
- WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- /* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
wake_up_q(&wake_q);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index b8120abe594b..9512e37637dc 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -204,19 +204,18 @@ struct semaphore_waiter {
static inline int __sched __down_common(struct semaphore *sem, long state,
long timeout)
{
- struct task_struct *task = current;
struct semaphore_waiter waiter;
list_add_tail(&waiter.list, &sem->wait_list);
- waiter.task = task;
+ waiter.task = current;
waiter.up = false;
for (;;) {
- if (signal_pending_state(state, task))
+ if (signal_pending_state(state, current))
goto interrupted;
if (unlikely(timeout <= 0))
goto timed_out;
- __set_task_state(task, state);
+ __set_current_state(state);
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index db3ccb1dd614..4b082b5cac9e 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
-void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
-{
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
- spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
-}
-EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
-
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..9aa0fccd5d43 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
lock->owner_cpu = -1;
}
-static void __spin_lock_debug(raw_spinlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
-
- for (i = 0; i < loops; i++) {
- if (arch_spin_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- spin_dump(lock, "lockup suspected");
-#ifdef CONFIG_SMP
- trigger_all_cpu_backtrace();
-#endif
-
- /*
- * The trylock above was causing a livelock. Give the lower level arch
- * specific lock code a chance to acquire the lock. We have already
- * printed a warning/backtrace at this point. The non-debug arch
- * specific code might actually succeed in acquiring the lock. If it is
- * not successful, the end-result is the same - there is no forward
- * progress.
- */
- arch_spin_lock(&lock->raw_lock);
-}
-
+/*
+ * We are now relying on the NMI watchdog to detect lockup instead of doing
+ * the detection here with an unfair lock which can cause problems of its own.
+ */
void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
- if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
- __spin_lock_debug(lock);
+ arch_spin_lock(&lock->raw_lock);
debug_spin_lock_after(lock);
}
@@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
-#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
-static void __read_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_read_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
void do_raw_read_lock(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
@@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock)
lock->owner_cpu = -1;
}
-#if 0 /* This can cause lockups */
-static void __write_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_write_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
void do_raw_write_lock(rwlock_t *lock)
{
debug_write_lock_before(lock);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
new file mode 100644
index 000000000000..da6c9a34f62f
--- /dev/null
+++ b/kernel/locking/test-ww_mutex.c
@@ -0,0 +1,646 @@
+/*
+ * Module-based API test facility for ww_mutexes
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ */
+
+#include <linux/kernel.h>
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/ww_mutex.h>
+
+static DEFINE_WW_CLASS(ww_class);
+struct workqueue_struct *wq;
+
+struct test_mutex {
+ struct work_struct work;
+ struct ww_mutex mutex;
+ struct completion ready, go, done;
+ unsigned int flags;
+};
+
+#define TEST_MTX_SPIN BIT(0)
+#define TEST_MTX_TRY BIT(1)
+#define TEST_MTX_CTX BIT(2)
+#define __TEST_MTX_LAST BIT(3)
+
+static void test_mutex_work(struct work_struct *work)
+{
+ struct test_mutex *mtx = container_of(work, typeof(*mtx), work);
+
+ complete(&mtx->ready);
+ wait_for_completion(&mtx->go);
+
+ if (mtx->flags & TEST_MTX_TRY) {
+ while (!ww_mutex_trylock(&mtx->mutex))
+ cpu_relax();
+ } else {
+ ww_mutex_lock(&mtx->mutex, NULL);
+ }
+ complete(&mtx->done);
+ ww_mutex_unlock(&mtx->mutex);
+}
+
+static int __test_mutex(unsigned int flags)
+{
+#define TIMEOUT (HZ / 16)
+ struct test_mutex mtx;
+ struct ww_acquire_ctx ctx;
+ int ret;
+
+ ww_mutex_init(&mtx.mutex, &ww_class);
+ ww_acquire_init(&ctx, &ww_class);
+
+ INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
+ init_completion(&mtx.ready);
+ init_completion(&mtx.go);
+ init_completion(&mtx.done);
+ mtx.flags = flags;
+
+ schedule_work(&mtx.work);
+
+ wait_for_completion(&mtx.ready);
+ ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL);
+ complete(&mtx.go);
+ if (flags & TEST_MTX_SPIN) {
+ unsigned long timeout = jiffies + TIMEOUT;
+
+ ret = 0;
+ do {
+ if (completion_done(&mtx.done)) {
+ ret = -EINVAL;
+ break;
+ }
+ cpu_relax();
+ } while (time_before(jiffies, timeout));
+ } else {
+ ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
+ }
+ ww_mutex_unlock(&mtx.mutex);
+ ww_acquire_fini(&ctx);
+
+ if (ret) {
+ pr_err("%s(flags=%x): mutual exclusion failure\n",
+ __func__, flags);
+ ret = -EINVAL;
+ }
+
+ flush_work(&mtx.work);
+ destroy_work_on_stack(&mtx.work);
+ return ret;
+#undef TIMEOUT
+}
+
+static int test_mutex(void)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < __TEST_MTX_LAST; i++) {
+ ret = __test_mutex(i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int test_aa(void)
+{
+ struct ww_mutex mutex;
+ struct ww_acquire_ctx ctx;
+ int ret;
+
+ ww_mutex_init(&mutex, &ww_class);
+ ww_acquire_init(&ctx, &ww_class);
+
+ ww_mutex_lock(&mutex, &ctx);
+
+ if (ww_mutex_trylock(&mutex)) {
+ pr_err("%s: trylocked itself!\n", __func__);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret != -EALREADY) {
+ pr_err("%s: missed deadlock for recursing, ret=%d\n",
+ __func__, ret);
+ if (!ret)
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ww_mutex_unlock(&mutex);
+ ww_acquire_fini(&ctx);
+ return ret;
+}
+
+struct test_abba {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex b_mutex;
+ struct completion a_ready;
+ struct completion b_ready;
+ bool resolve;
+ int result;
+};
+
+static void test_abba_work(struct work_struct *work)
+{
+ struct test_abba *abba = container_of(work, typeof(*abba), work);
+ struct ww_acquire_ctx ctx;
+ int err;
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+
+ complete(&abba->b_ready);
+ wait_for_completion(&abba->a_ready);
+
+ err = ww_mutex_lock(&abba->a_mutex, &ctx);
+ if (abba->resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_mutex_lock_slow(&abba->a_mutex, &ctx);
+ err = ww_mutex_lock(&abba->b_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba->a_mutex);
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_acquire_fini(&ctx);
+
+ abba->result = err;
+}
+
+static int test_abba(bool resolve)
+{
+ struct test_abba abba;
+ struct ww_acquire_ctx ctx;
+ int err, ret;
+
+ ww_mutex_init(&abba.a_mutex, &ww_class);
+ ww_mutex_init(&abba.b_mutex, &ww_class);
+ INIT_WORK_ONSTACK(&abba.work, test_abba_work);
+ init_completion(&abba.a_ready);
+ init_completion(&abba.b_ready);
+ abba.resolve = resolve;
+
+ schedule_work(&abba.work);
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+
+ complete(&abba.a_ready);
+ wait_for_completion(&abba.b_ready);
+
+ err = ww_mutex_lock(&abba.b_mutex, &ctx);
+ if (resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_mutex_lock_slow(&abba.b_mutex, &ctx);
+ err = ww_mutex_lock(&abba.a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba.b_mutex);
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_acquire_fini(&ctx);
+
+ flush_work(&abba.work);
+ destroy_work_on_stack(&abba.work);
+
+ ret = 0;
+ if (resolve) {
+ if (err || abba.result) {
+ pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ } else {
+ if (err != -EDEADLK && abba.result != -EDEADLK) {
+ pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
+struct test_cycle {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex *b_mutex;
+ struct completion *a_signal;
+ struct completion b_signal;
+ int result;
+};
+
+static void test_cycle_work(struct work_struct *work)
+{
+ struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
+ struct ww_acquire_ctx ctx;
+ int err;
+
+ ww_acquire_init(&ctx, &ww_class);
+ ww_mutex_lock(&cycle->a_mutex, &ctx);
+
+ complete(cycle->a_signal);
+ wait_for_completion(&cycle->b_signal);
+
+ err = ww_mutex_lock(cycle->b_mutex, &ctx);
+ if (err == -EDEADLK) {
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_mutex_lock_slow(cycle->b_mutex, &ctx);
+ err = ww_mutex_lock(&cycle->a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(cycle->b_mutex);
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_acquire_fini(&ctx);
+
+ cycle->result = err;
+}
+
+static int __test_cycle(unsigned int nthreads)
+{
+ struct test_cycle *cycles;
+ unsigned int n, last = nthreads - 1;
+ int ret;
+
+ cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL);
+ if (!cycles)
+ return -ENOMEM;
+
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ ww_mutex_init(&cycle->a_mutex, &ww_class);
+ if (n == last)
+ cycle->b_mutex = &cycles[0].a_mutex;
+ else
+ cycle->b_mutex = &cycles[n + 1].a_mutex;
+
+ if (n == 0)
+ cycle->a_signal = &cycles[last].b_signal;
+ else
+ cycle->a_signal = &cycles[n - 1].b_signal;
+ init_completion(&cycle->b_signal);
+
+ INIT_WORK(&cycle->work, test_cycle_work);
+ cycle->result = 0;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ queue_work(wq, &cycles[n].work);
+
+ flush_workqueue(wq);
+
+ ret = 0;
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ if (!cycle->result)
+ continue;
+
+ pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n",
+ n, nthreads, cycle->result);
+ ret = -EINVAL;
+ break;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ ww_mutex_destroy(&cycles[n].a_mutex);
+ kfree(cycles);
+ return ret;
+}
+
+static int test_cycle(unsigned int ncpus)
+{
+ unsigned int n;
+ int ret;
+
+ for (n = 2; n <= ncpus + 1; n++) {
+ ret = __test_cycle(n);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct stress {
+ struct work_struct work;
+ struct ww_mutex *locks;
+ int nlocks;
+ int nloops;
+};
+
+static int *get_random_order(int count)
+{
+ int *order;
+ int n, r, tmp;
+
+ order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
+ if (!order)
+ return order;
+
+ for (n = 0; n < count; n++)
+ order[n] = n;
+
+ for (n = count - 1; n > 1; n--) {
+ r = get_random_int() % (n + 1);
+ if (r != n) {
+ tmp = order[n];
+ order[n] = order[r];
+ order[r] = tmp;
+ }
+ }
+
+ return order;
+}
+
+static void dummy_load(struct stress *stress)
+{
+ usleep_range(1000, 2000);
+}
+
+static void stress_inorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *locks = stress->locks;
+ struct ww_acquire_ctx ctx;
+ int *order;
+
+ order = get_random_order(nlocks);
+ if (!order)
+ return;
+
+ ww_acquire_init(&ctx, &ww_class);
+
+ do {
+ int contended = -1;
+ int n, err;
+
+retry:
+ err = 0;
+ for (n = 0; n < nlocks; n++) {
+ if (n == contended)
+ continue;
+
+ err = ww_mutex_lock(&locks[order[n]], &ctx);
+ if (err < 0)
+ break;
+ }
+ if (!err)
+ dummy_load(stress);
+
+ if (contended > n)
+ ww_mutex_unlock(&locks[order[contended]]);
+ contended = n;
+ while (n--)
+ ww_mutex_unlock(&locks[order[n]]);
+
+ if (err == -EDEADLK) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
+
+ if (err) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+ } while (--stress->nloops);
+
+ ww_acquire_fini(&ctx);
+
+ kfree(order);
+ kfree(stress);
+}
+
+struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+};
+
+static void stress_reorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ LIST_HEAD(locks);
+ struct ww_acquire_ctx ctx;
+ struct reorder_lock *ll, *ln;
+ int *order;
+ int n, err;
+
+ order = get_random_order(stress->nlocks);
+ if (!order)
+ return;
+
+ for (n = 0; n < stress->nlocks; n++) {
+ ll = kmalloc(sizeof(*ll), GFP_KERNEL);
+ if (!ll)
+ goto out;
+
+ ll->lock = &stress->locks[order[n]];
+ list_add(&ll->link, &locks);
+ }
+ kfree(order);
+ order = NULL;
+
+ ww_acquire_init(&ctx, &ww_class);
+
+ do {
+ list_for_each_entry(ll, &locks, link) {
+ err = ww_mutex_lock(ll->lock, &ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &locks, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+
+ ww_mutex_lock_slow(ll->lock, &ctx);
+ list_move(&ll->link, &locks); /* restarts iteration */
+ }
+
+ dummy_load(stress);
+ list_for_each_entry(ll, &locks, link)
+ ww_mutex_unlock(ll->lock);
+ } while (--stress->nloops);
+
+ ww_acquire_fini(&ctx);
+
+out:
+ list_for_each_entry_safe(ll, ln, &locks, link)
+ kfree(ll);
+ kfree(order);
+ kfree(stress);
+}
+
+static void stress_one_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks);
+ int err;
+
+ do {
+ err = ww_mutex_lock(lock, NULL);
+ if (!err) {
+ dummy_load(stress);
+ ww_mutex_unlock(lock);
+ } else {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+ } while (--stress->nloops);
+
+ kfree(stress);
+}
+
+#define STRESS_INORDER BIT(0)
+#define STRESS_REORDER BIT(1)
+#define STRESS_ONE BIT(2)
+#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
+
+static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
+{
+ struct ww_mutex *locks;
+ int n;
+
+ locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
+ if (!locks)
+ return -ENOMEM;
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_init(&locks[n], &ww_class);
+
+ for (n = 0; nthreads; n++) {
+ struct stress *stress;
+ void (*fn)(struct work_struct *work);
+
+ fn = NULL;
+ switch (n & 3) {
+ case 0:
+ if (flags & STRESS_INORDER)
+ fn = stress_inorder_work;
+ break;
+ case 1:
+ if (flags & STRESS_REORDER)
+ fn = stress_reorder_work;
+ break;
+ case 2:
+ if (flags & STRESS_ONE)
+ fn = stress_one_work;
+ break;
+ }
+
+ if (!fn)
+ continue;
+
+ stress = kmalloc(sizeof(*stress), GFP_KERNEL);
+ if (!stress)
+ break;
+
+ INIT_WORK(&stress->work, fn);
+ stress->locks = locks;
+ stress->nlocks = nlocks;
+ stress->nloops = nloops;
+
+ queue_work(wq, &stress->work);
+ nthreads--;
+ }
+
+ flush_workqueue(wq);
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_destroy(&locks[n]);
+ kfree(locks);
+
+ return 0;
+}
+
+static int __init test_ww_mutex_init(void)
+{
+ int ncpus = num_online_cpus();
+ int ret;
+
+ wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+
+ ret = test_mutex();
+ if (ret)
+ return ret;
+
+ ret = test_aa();
+ if (ret)
+ return ret;
+
+ ret = test_abba(false);
+ if (ret)
+ return ret;
+
+ ret = test_abba(true);
+ if (ret)
+ return ret;
+
+ ret = test_cycle(ncpus);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void __exit test_ww_mutex_exit(void)
+{
+ destroy_workqueue(wq);
+}
+
+module_init(test_ww_mutex_init);
+module_exit(test_ww_mutex_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727a56e9..9f9284f37f8d 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/membarrier.h>
+#include <linux/tick.h>
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -51,6 +52,9 @@
*/
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -ENOSYS;
if (unlikely(flags))
return -EINVAL;
switch (cmd) {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b501e390bb34..9ecedc28b928 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,7 +246,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
+ mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
@@ -358,7 +360,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
+ mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
+ mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 5432dbedf8cf..e2eec4b47143 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,7 +46,7 @@
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
@@ -314,8 +314,11 @@ struct load_info {
} index;
};
-/* We require a truly strong try_module_get(): 0 means failure due to
- ongoing or failed initialization etc. */
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
static inline int strong_try_module_get(struct module *mod)
{
BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
@@ -331,7 +334,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
enum lockdep_ok lockdep_ok)
{
add_taint(flag, lockdep_ok);
- mod->taints |= (1U << flag);
+ set_bit(flag, &mod->taints);
}
/*
@@ -387,16 +390,16 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const struct kernel_symbol __start___ksymtab_gpl_future[];
extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const unsigned long __start___kcrctab[];
-extern const unsigned long __start___kcrctab_gpl[];
-extern const unsigned long __start___kcrctab_gpl_future[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+extern const s32 __start___kcrctab_gpl_future[];
#ifdef CONFIG_UNUSED_SYMBOLS
extern const struct kernel_symbol __start___ksymtab_unused[];
extern const struct kernel_symbol __stop___ksymtab_unused[];
extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const unsigned long __start___kcrctab_unused[];
-extern const unsigned long __start___kcrctab_unused_gpl[];
+extern const s32 __start___kcrctab_unused[];
+extern const s32 __start___kcrctab_unused_gpl[];
#endif
#ifndef CONFIG_MODVERSIONS
@@ -495,7 +498,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
- const unsigned long *crc;
+ const s32 *crc;
const struct kernel_symbol *sym;
};
@@ -561,7 +564,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
- const unsigned long **crc,
+ const s32 **crc,
bool gplok,
bool warn)
{
@@ -1139,22 +1142,13 @@ static inline int module_unload_init(struct module *mod)
static size_t module_flags_taint(struct module *mod, char *buf)
{
size_t l = 0;
+ int i;
+
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ if (taint_flags[i].module && test_bit(i, &mod->taints))
+ buf[l++] = taint_flags[i].c_true;
+ }
- if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
- buf[l++] = 'P';
- if (mod->taints & (1 << TAINT_OOT_MODULE))
- buf[l++] = 'O';
- if (mod->taints & (1 << TAINT_FORCED_MODULE))
- buf[l++] = 'F';
- if (mod->taints & (1 << TAINT_CRAP))
- buf[l++] = 'C';
- if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
- buf[l++] = 'E';
- /*
- * TAINT_FORCED_RMMOD: could be added.
- * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
- * apply to modules.
- */
return l;
}
@@ -1256,23 +1250,17 @@ static int try_to_force_load(struct module *mod, const char *reason)
}
#ifdef CONFIG_MODVERSIONS
-/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
-static unsigned long maybe_relocated(unsigned long crc,
- const struct module *crc_owner)
+
+static u32 resolve_rel_crc(const s32 *crc)
{
-#ifdef ARCH_RELOCATES_KCRCTAB
- if (crc_owner == NULL)
- return crc - (unsigned long)reloc_start;
-#endif
- return crc;
+ return *(u32 *)((void *)crc + *crc);
}
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
unsigned int i, num_versions;
struct modversion_info *versions;
@@ -1290,18 +1278,25 @@ static int check_version(Elf_Shdr *sechdrs,
/ sizeof(struct modversion_info);
for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
if (strcmp(versions[i].name, symname) != 0)
continue;
- if (versions[i].crc == maybe_relocated(*crc, crc_owner))
+ if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+ crcval = resolve_rel_crc(crc);
+ else
+ crcval = *crc;
+ if (versions[i].crc == crcval)
return 1;
- pr_debug("Found checksum %lX vs module %lX\n",
- maybe_relocated(*crc, crc_owner), versions[i].crc);
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
goto bad_version;
}
- pr_warn("%s: no symbol version for %s\n", mod->name, symname);
- return 0;
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", mod->name, symname);
+ return 1;
bad_version:
pr_warn("%s: disagrees about version of symbol %s\n",
@@ -1313,7 +1308,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
unsigned int versindex,
struct module *mod)
{
- const unsigned long *crc;
+ const s32 *crc;
/*
* Since this should be found in kernel (which can't be removed), no
@@ -1327,8 +1322,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
}
preempt_enable();
return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc,
- NULL);
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1346,8 +1340,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
+ const s32 *crc)
{
return 1;
}
@@ -1374,7 +1367,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
{
struct module *owner;
const struct kernel_symbol *sym;
- const unsigned long *crc;
+ const s32 *crc;
int err;
/*
@@ -1389,8 +1382,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (!sym)
goto unlock;
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
- owner)) {
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
@@ -1909,6 +1901,9 @@ static void frob_writable_data(const struct module_layout *layout,
/* livepatching wants to disable read-only so it can frob module. */
void module_disable_ro(const struct module *mod)
{
+ if (!rodata_enabled)
+ return;
+
frob_text(&mod->core_layout, set_memory_rw);
frob_rodata(&mod->core_layout, set_memory_rw);
frob_ro_after_init(&mod->core_layout, set_memory_rw);
@@ -1918,6 +1913,9 @@ void module_disable_ro(const struct module *mod)
void module_enable_ro(const struct module *mod, bool after_init)
{
+ if (!rodata_enabled)
+ return;
+
frob_text(&mod->core_layout, set_memory_ro);
frob_rodata(&mod->core_layout, set_memory_ro);
frob_text(&mod->init_layout, set_memory_ro);
@@ -1950,6 +1948,9 @@ void set_all_modules_text_rw(void)
{
struct module *mod;
+ if (!rodata_enabled)
+ return;
+
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
@@ -1966,9 +1967,18 @@ void set_all_modules_text_ro(void)
{
struct module *mod;
+ if (!rodata_enabled)
+ return;
+
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
+ /*
+ * Ignore going modules since it's possible that ro
+ * protection has already been disabled, otherwise we'll
+ * run into protection faults at module deallocation.
+ */
+ if (mod->state == MODULE_STATE_UNFORMED ||
+ mod->state == MODULE_STATE_GOING)
continue;
frob_text(&mod->core_layout, set_memory_ro);
@@ -1979,10 +1989,12 @@ void set_all_modules_text_ro(void)
static void disable_ro_nx(const struct module_layout *layout)
{
- frob_text(layout, set_memory_rw);
- frob_rodata(layout, set_memory_rw);
+ if (rodata_enabled) {
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_ro_after_init(layout, set_memory_rw);
+ }
frob_rodata(layout, set_memory_x);
- frob_ro_after_init(layout, set_memory_rw);
frob_ro_after_init(layout, set_memory_x);
frob_writable_data(layout, set_memory_x);
}
@@ -2793,14 +2805,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
}
#ifdef CONFIG_LIVEPATCH
-static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
- mod->klp = get_modinfo(info, "livepatch") ? true : false;
+ if (get_modinfo(info, "livepatch")) {
+ mod->klp = true;
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ }
return 0;
}
#else /* !CONFIG_LIVEPATCH */
-static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
if (get_modinfo(info, "livepatch")) {
pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
@@ -2970,7 +2985,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
"is unknown, you have been warned.\n", mod->name);
}
- err = find_livepatch_modinfo(mod, info);
+ err = check_modinfo_livepatch(mod, info);
if (err)
return err;
@@ -3706,6 +3721,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
+ mod->state = MODULE_STATE_GOING;
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
@@ -4039,6 +4055,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
}
#endif /* CONFIG_KALLSYMS */
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
@@ -4083,7 +4103,7 @@ static void m_stop(struct seq_file *m, void *p)
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
- char buf[8];
+ char buf[MODULE_FLAGS_BUF_SIZE];
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4254,7 +4274,7 @@ EXPORT_SYMBOL_GPL(__module_text_address);
void print_modules(void)
{
struct module *mod;
- char buf[8];
+ char buf[MODULE_FLAGS_BUF_SIZE];
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
diff --git a/kernel/padata.c b/kernel/padata.c
index 993278895ccc..05316c9f32da 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
+#include <linux/module.h>
#define MAX_OBJ_NUM 1000
@@ -63,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd)
static void padata_parallel_worker(struct work_struct *parallel_work)
{
struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
pqueue = container_of(parallel_work,
struct padata_parallel_queue, work);
- pd = pqueue->pd;
- pinst = pd->pinst;
spin_lock(&pqueue->parallel.lock);
list_replace_init(&pqueue->parallel.list, &local_list);
@@ -769,52 +766,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
}
-
-static int padata_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
{
- int err;
struct padata_instance *pinst;
- int cpu = (unsigned long)hcpu;
+ int ret;
- pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+ pinst = hlist_entry_safe(node, struct padata_instance, node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_add_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
+ mutex_lock(&pinst->lock);
+ ret = __padata_add_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ return ret;
+}
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_remove_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
- }
+static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
+{
+ struct padata_instance *pinst;
+ int ret;
- return NOTIFY_OK;
+ pinst = hlist_entry_safe(node, struct padata_instance, node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
+
+ mutex_lock(&pinst->lock);
+ ret = __padata_remove_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ return ret;
}
+
+static enum cpuhp_state hp_online;
#endif
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
- unregister_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
#endif
padata_stop(pinst);
@@ -1012,11 +1000,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
- pinst->cpu_notifier.notifier_call = padata_cpu_callback;
- pinst->cpu_notifier.priority = 0;
- register_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_add_instance_nocalls(hp_online, &pinst->node);
#endif
-
return pinst;
err_free_masks:
@@ -1039,3 +1024,26 @@ void padata_free(struct padata_instance *pinst)
kobject_put(&pinst->kobj);
}
EXPORT_SYMBOL(padata_free);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static __init int padata_driver_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
+ padata_cpu_online,
+ padata_cpu_prep_down);
+ if (ret < 0)
+ return ret;
+ hp_online = ret;
+ return 0;
+}
+module_init(padata_driver_init);
+
+static __exit void padata_driver_exit(void)
+{
+ cpuhp_remove_multi_state(hp_online);
+}
+module_exit(padata_driver_exit);
+#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index ca8cea1ef673..08aa88dde7de 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs)
panic_smp_self_stop();
}
+/*
+ * Stop other CPUs in panic. Architecture dependent code may override this
+ * with a more suitable version. For example, if the architecture supports
+ * crash dump, it should save registers of each stopped CPU and disable
+ * per-CPU features such as virtualization extensions.
+ */
+void __weak crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ /*
+ * This function can be called twice in panic path, but obviously
+ * we execute this only once.
+ */
+ if (cpus_stopped)
+ return;
+
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
+ smp_send_stop();
+ cpus_stopped = 1;
+}
+
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
/*
@@ -164,14 +190,21 @@ void panic(const char *fmt, ...)
if (!_crash_kexec_post_notifiers) {
printk_nmi_flush_on_panic();
__crash_kexec(NULL);
- }
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a panic
- * situation.
- */
- smp_send_stop();
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a
+ * panic situation.
+ */
+ smp_send_stop();
+ } else {
+ /*
+ * If we want to do crash dump after notifier calls and
+ * kmsg_dump, we will need architecture dependent extra
+ * work in addition to stopping other CPUs.
+ */
+ crash_smp_send_stop();
+ }
/*
* Run any panic handlers, including those that might need to
@@ -216,7 +249,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -265,30 +298,27 @@ void panic(const char *fmt, ...)
EXPORT_SYMBOL(panic);
-
-struct tnt {
- u8 bit;
- char true;
- char false;
-};
-
-static const struct tnt tnts[] = {
- { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
- { TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
- { TAINT_FORCED_RMMOD, 'R', ' ' },
- { TAINT_MACHINE_CHECK, 'M', ' ' },
- { TAINT_BAD_PAGE, 'B', ' ' },
- { TAINT_USER, 'U', ' ' },
- { TAINT_DIE, 'D', ' ' },
- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
- { TAINT_WARN, 'W', ' ' },
- { TAINT_CRAP, 'C', ' ' },
- { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
- { TAINT_OOT_MODULE, 'O', ' ' },
- { TAINT_UNSIGNED_MODULE, 'E', ' ' },
- { TAINT_SOFTLOCKUP, 'L', ' ' },
- { TAINT_LIVEPATCH, 'K', ' ' },
+/*
+ * TAINT_FORCED_RMMOD could be a per-module flag but the module
+ * is being removed anyway.
+ */
+const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
+ { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */
+ { 'F', ' ', true }, /* TAINT_FORCED_MODULE */
+ { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */
+ { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */
+ { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */
+ { 'B', ' ', false }, /* TAINT_BAD_PAGE */
+ { 'U', ' ', false }, /* TAINT_USER */
+ { 'D', ' ', false }, /* TAINT_DIE */
+ { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */
+ { 'W', ' ', false }, /* TAINT_WARN */
+ { 'C', ' ', true }, /* TAINT_CRAP */
+ { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */
+ { 'O', ' ', true }, /* TAINT_OOT_MODULE */
+ { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
+ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
+ { 'K', ' ', true }, /* TAINT_LIVEPATCH */
};
/**
@@ -315,17 +345,17 @@ static const struct tnt tnts[] = {
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
+ static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
if (tainted_mask) {
char *s;
int i;
s = buf + sprintf(buf, "Tainted: ");
- for (i = 0; i < ARRAY_SIZE(tnts); i++) {
- const struct tnt *t = &tnts[i];
- *s++ = test_bit(t->bit, &tainted_mask) ?
- t->true : t->false;
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ const struct taint_flag *t = &taint_flags[i];
+ *s++ = test_bit(i, &tainted_mask) ?
+ t->c_true : t->c_false;
}
*s = 0;
} else
diff --git a/kernel/pid.c b/kernel/pid.c
index f66162f2359b..0291804151b5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
+ .kref = KREF_INIT(2),
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..eef2ce968636 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
+}
+
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
+ struct ucounts *ucounts;
int i;
int err;
- if (level > MAX_PID_NS_LEVEL) {
- err = -EINVAL;
+ err = -ENOSPC;
+ if (level > MAX_PID_NS_LEVEL)
+ goto out;
+ ucounts = inc_pid_namespaces(user_ns);
+ if (!ucounts)
goto out;
- }
err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
- goto out;
+ goto out_dec;
ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
+ ns->ucounts = ucounts;
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
@@ -129,14 +143,20 @@ out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+ dec_pid_namespaces(ucounts);
out:
return ERR_PTR(err);
}
static void delayed_free_pidns(struct rcu_head *p)
{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
+ struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
+
+ dec_pid_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+
+ kmem_cache_free(pid_ns_cachep, ns);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
@@ -146,7 +166,6 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
- put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -388,12 +407,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *pid_ns, *p;
+
+ /* See if the parent is in the current namespace */
+ pid_ns = p = to_pid_ns(ns)->parent;
+ for (;;) {
+ if (!p)
+ return ERR_PTR(-EPERM);
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return &get_pid_ns(pid_ns)->ns;
+}
+
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+ return to_pid_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
};
static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 68d3ebc12601..e8517b63eb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG
config DPM_WATCHDOG
bool "Device suspend/resume watchdog"
- depends on PM_DEBUG && PSTORE
+ depends on PM_DEBUG && PSTORE && EXPERT
---help---
Sets up a watchdog timer to capture drivers that are
locked up attempting to suspend/resume a device.
@@ -197,7 +197,7 @@ config DPM_WATCHDOG
config DPM_WATCHDOG_TIMEOUT
int "Watchdog timeout in seconds"
range 1 120
- default 60
+ default 120
depends on DPM_WATCHDOG
config PM_TRACE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 33c79b6105c5..b26dbc48c75b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -306,8 +306,10 @@ static int create_image(int platform_mode)
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- if (!in_suspend)
+ if (!in_suspend) {
events_check_enabled = false;
+ clear_free_pages();
+ }
platform_leave(platform_mode);
@@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str)
return 1;
}
-static int __init page_poison_nohibernate_setup(char *str)
-{
-#ifdef CONFIG_PAGE_POISONING_ZERO
- /*
- * The zeroing option for page poison skips the checks on alloc.
- * since hibernation doesn't save free pages there's no way to
- * guarantee the pages will still be zeroed.
- */
- if (!strcmp(str, "on")) {
- pr_info("Disabling hibernation due to page poisoning\n");
- return nohibernate_setup(str);
- }
-#endif
- return 1;
-}
-
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
@@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
-__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5ea50b1b7595..d401c21136d1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_async);
+#ifdef CONFIG_SUSPEND
+static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (mem_sleep_states[i]) {
+ const char *label = mem_sleep_states[i];
+
+ if (mem_sleep_current == i)
+ s += sprintf(s, "[%s] ", label);
+ else
+ s += sprintf(s, "%s ", label);
+ }
+
+ /* Convert the last space to a newline if needed. */
+ if (s != buf)
+ *(s-1) = '\n';
+
+ return (s - buf);
+}
+
+static suspend_state_t decode_suspend_state(const char *buf, size_t n)
+{
+ suspend_state_t state;
+ char *p;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = mem_sleep_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
+
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_suspend_state(buf, n);
+ if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
+ mem_sleep_current = state;
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error ? error : n;
+}
+
+power_attr(mem_sleep);
+#endif /* CONFIG_SUSPEND */
+
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
@@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
}
state = decode_state(buf, n);
- if (state < PM_SUSPEND_MAX)
+ if (state < PM_SUSPEND_MAX) {
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_suspend(state);
- else if (state == PM_SUSPEND_MAX)
+ } else if (state == PM_SUSPEND_MAX) {
error = hibernate();
- else
+ } else {
error = -EINVAL;
+ }
out:
pm_autosleep_unlock();
@@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj,
&& strcmp(buf, "off") && strcmp(buf, "off\n"))
return -EINVAL;
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_autosleep_set_state(state);
return error ? error : n;
}
@@ -602,6 +681,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_SUSPEND
+ &mem_sleep_attr.attr,
+#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
#endif
@@ -644,6 +726,7 @@ static int __init pm_init(void)
return error;
hibernate_image_size_init();
hibernate_reserved_size_init();
+ pm_states_init();
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 242d8b827dd5..1dfa0da827d3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
+extern void clear_free_pages(void);
+
/**
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
@@ -187,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
-extern const char *pm_labels[];
+extern const char * const pm_labels[];
extern const char *pm_states[];
+extern const char *mem_sleep_states[];
+extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
+#define mem_sleep_current PM_SUSPEND_ON
+
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8f27d5a8adf6..2fba066e125f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -144,23 +144,12 @@ int freeze_processes(void)
/*
* Now that the whole userspace is frozen we need to disbale
* the OOM killer to disallow any further interference with
- * killable tasks.
+ * killable tasks. There is no guarantee oom victims will ever
+ * reach a point where they go away, so we have to wait with a timeout.
*/
- if (!error && !oom_killer_disable())
+ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
error = -EBUSY;
- /*
- * There is a hard to fix race between oom_reaper kernel thread
- * and oom_killer_disable. oom_reaper calls exit_oom_victim
- * before the victim reaches exit_mm so try to freeze all the tasks
- * again and catch such a left over task.
- */
- if (!error) {
- pr_info("Double checking all user space processes after OOM killer disable... ");
- error = try_to_freeze_tasks(true);
- pr_cont("\n");
- }
-
if (error)
thaw_processes();
return error;
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 168ff442ebde..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -482,16 +482,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- /*
- * This function may be called very early during boot, for example,
- * from of_clk_init(), where irq needs to stay disabled.
- * cancel_delayed_work_sync() assumes that irq is enabled on
- * invocation and re-enables it on return. Avoid calling it until
- * workqueue is initialized.
- */
- if (keventd_up())
- cancel_delayed_work_sync(&req->work);
-
+ cancel_delayed_work_sync(&req->work);
__pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b02228411d57..2d8e2b227db8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,7 +30,7 @@
#include <linux/compiler.h>
#include <linux/ktime.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void)
pr_debug("PM: Basic memory bitmaps freed\n");
}
+void clear_free_pages(void)
+{
+#ifdef CONFIG_PAGE_POISONING_ZERO
+ struct memory_bitmap *bm = free_pages_map;
+ unsigned long pfn;
+
+ if (WARN_ON(!(free_pages_map)))
+ return;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_highpage(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("PM: free pages cleared after restore\n");
+#endif /* PAGE_POISONING_ZERO */
+}
+
/**
* snapshot_additional_pages - Estimate the number of extra pages needed.
* @zone: Memory zone to carry out the computation for.
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0acab9d7f96f..15e6baef5c73 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -32,8 +32,21 @@
#include "power.h"
-const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char * const pm_labels[] = {
+ [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
const char *pm_states[PM_SUSPEND_MAX];
+static const char * const mem_sleep_labels[] = {
+ [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_STANDBY] = "shallow",
+ [PM_SUSPEND_MEM] = "deep",
+};
+const char *mem_sleep_states[PM_SUSPEND_MAX];
+
+suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
+static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -110,22 +123,32 @@ static bool valid_state(suspend_state_t state)
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}
-/*
- * If this is set, the "mem" label always corresponds to the deepest sleep state
- * available, the "standby" label corresponds to the second deepest sleep state
- * available (if any), and the "freeze" label corresponds to the remaining
- * available sleep state (if there is one).
- */
-static bool relative_states;
+void __init pm_states_init(void)
+{
+ /* "mem" and "freeze" are always present in /sys/power/state. */
+ pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
+ /*
+ * Suspend-to-idle should be supported even without any suspend_ops,
+ * initialize mem_sleep_states[] accordingly here.
+ */
+ mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
+}
-static int __init sleep_states_setup(char *str)
+static int __init mem_sleep_default_setup(char *str)
{
- relative_states = !strncmp(str, "1", 1);
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+ suspend_state_t state;
+
+ for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ if (mem_sleep_labels[state] &&
+ !strcmp(str, mem_sleep_labels[state])) {
+ mem_sleep_default = state;
+ break;
+ }
+
return 1;
}
-
-__setup("relative_sleep_states=", sleep_states_setup);
+__setup("mem_sleep_default=", mem_sleep_default_setup);
/**
* suspend_set_ops - Set the global suspend method table.
@@ -133,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- suspend_state_t i;
- int j = 0;
-
lock_system_sleep();
suspend_ops = ops;
- for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i)) {
- pm_states[i] = pm_labels[j++];
- } else if (!relative_states) {
- pm_states[i] = NULL;
- j++;
- }
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
+ if (valid_state(PM_SUSPEND_STANDBY)) {
+ mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY];
+ pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY];
+ if (mem_sleep_default == PM_SUSPEND_STANDBY)
+ mem_sleep_current = PM_SUSPEND_STANDBY;
+ }
+ if (valid_state(PM_SUSPEND_MEM)) {
+ mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
+ if (mem_sleep_default == PM_SUSPEND_MEM)
+ mem_sleep_current = PM_SUSPEND_MEM;
+ }
unlock_system_sleep();
}
@@ -211,7 +234,7 @@ static int platform_suspend_begin(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
return freeze_ops->begin();
- else if (suspend_ops->begin)
+ else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
return 0;
@@ -221,7 +244,7 @@ static void platform_resume_end(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
- else if (suspend_ops->end)
+ else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
@@ -490,9 +513,9 @@ static int enter_state(suspend_state_t state)
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- printk(KERN_INFO "PM: Syncing filesystems ... ");
+ pr_info("PM: Syncing filesystems ... ");
sys_sync();
- printk("done.\n");
+ pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 084452e34a12..5db217051232 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value)
return 0;
}
- for (i = 0; pm_labels[i]; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
@@ -203,8 +203,10 @@ static int __init test_suspend(void)
/* RTCs have initialized by now too ... can we use one? */
dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
- if (dev)
+ if (dev) {
rtc = rtc_class_open(dev_name(dev));
+ put_device(dev);
+ }
if (!rtc) {
printk(warn_no_rtc);
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a3b1e617bcdc..f80fd33639e0 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -201,7 +201,7 @@ void free_all_swap_pages(int swap)
struct swsusp_extent *ext;
unsigned long offset;
- ext = container_of(node, struct swsusp_extent, node);
+ ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
swap_free(swp_entry(swap, offset));
@@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
@@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
@@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
- tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1534,7 +1533,7 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+ error = hib_submit_io(REQ_OP_READ, 0,
swsusp_resume_block,
swsusp_header, NULL);
if (error)
@@ -1543,7 +1542,7 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
@@ -1588,11 +1587,11 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 35310b627388..22df9f7ff672 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -25,7 +25,7 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "power.h"
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 16bab471c7e2..f011aaef583c 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args)
again:
len = atomic_read(&s->len);
- if (len >= sizeof(s->buffer)) {
+ /* The trailing '\0' is not counted in len. */
+ if (len >= sizeof(s->buffer) - 1) {
atomic_inc(&nmi_message_lost);
return 0;
}
@@ -79,7 +80,7 @@ again:
if (!len)
smp_rmb();
- add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
/*
* Do it once again if the buffer has been flushed in the meantime.
@@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len)
}
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
- int start, int end)
+/* printk part of the temporary buffer line by line */
+static int printk_nmi_flush_buffer(const char *start, size_t len)
{
- const char *buf = s->buffer + start;
+ const char *c, *end;
+ bool header;
+
+ c = start;
+ end = start + len;
+ header = true;
+
+ /* Print line by line. */
+ while (c < end) {
+ if (*c == '\n') {
+ printk_nmi_flush_line(start, c - start + 1);
+ start = ++c;
+ header = true;
+ continue;
+ }
+
+ /* Handle continuation lines or a missing newline. */
+ if ((c + 1 < end) && printk_get_level(c)) {
+ if (header) {
+ c = printk_skip_level(c);
+ continue;
+ }
+
+ printk_nmi_flush_line(start, c - start);
+ start = c++;
+ header = true;
+ continue;
+ }
+
+ header = false;
+ c++;
+ }
- printk_nmi_flush_line(buf, (end - start) + 1);
+ /* Check if there was a partial line. Ignore pure header. */
+ if (start < end && !header) {
+ static const char newline[] = KERN_CONT "\n";
+
+ printk_nmi_flush_line(start, end - start);
+ printk_nmi_flush_line(newline, strlen(newline));
+ }
+
+ return len;
}
/*
@@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work)
__RAW_SPIN_LOCK_INITIALIZER(read_lock);
struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
unsigned long flags;
- size_t len, size;
- int i, last_i;
+ size_t len;
+ int i;
/*
* The lock has two functions. First, one reader has to flush all
@@ -154,12 +190,14 @@ more:
/*
* This is just a paranoid check that nobody has manipulated
* the buffer an unexpected way. If we printed something then
- * @len must only increase.
+ * @len must only increase. Also it should never overflow the
+ * buffer size.
*/
- if (i && i >= len) {
+ if ((i && i >= len) || len > sizeof(s->buffer)) {
const char *msg = "printk_nmi_flush: internal error\n";
printk_nmi_flush_line(msg, strlen(msg));
+ len = 0;
}
if (!len)
@@ -167,22 +205,7 @@ more:
/* Make sure that data has been written up to the @len */
smp_rmb();
-
- size = min(len, sizeof(s->buffer));
- last_i = i;
-
- /* Print line by line. */
- for (; i < size; i++) {
- if (s->buffer[i] == '\n') {
- printk_nmi_flush_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < size) {
- printk_nmi_flush_seq_line(s, last_i, size - 1);
- printk_nmi_flush_line("\n", strlen("\n"));
- }
+ i += printk_nmi_flush_buffer(s->buffer + i, len - i);
/*
* Check that nothing has got added in the meantime and truncate
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index eea6dbc2d8cf..4ba3d34938c0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,7 +46,7 @@
#include <linux/ctype.h>
#include <linux/uio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
#define CREATE_TRACE_POINTS
@@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
-static enum log_flags syslog_prev;
static size_t syslog_partial;
/* index and sequence number of the first record stored in the buffer */
@@ -370,7 +369,6 @@ static u32 log_next_idx;
/* the next printk record to write to the console */
static u64 console_seq;
static u32 console_idx;
-static enum log_flags console_prev;
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
@@ -639,30 +637,15 @@ static void append_char(char **pp, char *e, char c)
}
static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq,
- enum log_flags prev_flags)
+ struct printk_log *msg, u64 seq)
{
u64 ts_usec = msg->ts_nsec;
- char cont = '-';
do_div(ts_usec, 1000);
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT))
- cont = 'c';
- else if ((msg->flags & LOG_CONT) ||
- ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
- cont = '+';
-
return scnprintf(buf, size, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+ (msg->facility << 3) | msg->level, seq, ts_usec,
+ msg->flags & LOG_CONT ? 'c' : '-');
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -717,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
struct devkmsg_user {
u64 seq;
u32 idx;
- enum log_flags prev;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -751,7 +733,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
return -ENOMEM;
buf[len] = '\0';
- if (copy_from_iter(buf, len, from) != len) {
+ if (!copy_from_iter_full(buf, len, from)) {
kfree(buf);
return -EFAULT;
}
@@ -827,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
msg = log_from_idx(user->idx);
len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq, user->prev);
+ msg, user->seq);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
log_dict(msg), msg->dict_len,
log_text(msg), msg->text_len);
- user->prev = msg->flags;
user->idx = log_next(user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1213,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
- bool syslog, char *buf, size_t size)
+static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size)
{
const char *text = log_text(msg);
size_t text_size = msg->text_len;
- bool prefix = true;
- bool newline = true;
size_t len = 0;
- if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
- prefix = false;
-
- if (msg->flags & LOG_CONT) {
- if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
- prefix = false;
-
- if (!(msg->flags & LOG_NEWLINE))
- newline = false;
- }
-
do {
const char *next = memchr(text, '\n', text_size);
size_t text_len;
@@ -1250,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
text_len + 1 >= size - len)
break;
- if (prefix)
- len += print_prefix(msg, syslog, buf + len);
+ len += print_prefix(msg, syslog, buf + len);
memcpy(buf + len, text, text_len);
len += text_len;
- if (next || newline)
- buf[len++] = '\n';
+ buf[len++] = '\n';
} else {
/* SYSLOG_ACTION_* buffer size only calculation */
- if (prefix)
- len += print_prefix(msg, syslog, NULL);
+ len += print_prefix(msg, syslog, NULL);
len += text_len;
- if (next || newline)
- len++;
+ len++;
}
- prefix = true;
text = next;
} while (text);
@@ -1291,7 +1253,6 @@ static int syslog_print(char __user *buf, int size)
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
- syslog_prev = 0;
syslog_partial = 0;
}
if (syslog_seq == log_next_seq) {
@@ -1301,13 +1262,11 @@ static int syslog_print(char __user *buf, int size)
skip = syslog_partial;
msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, syslog_prev, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
syslog_idx = log_next(syslog_idx);
syslog_seq++;
- syslog_prev = msg->flags;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1350,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 next_seq;
u64 seq;
u32 idx;
- enum log_flags prev;
/*
* Find first record that fits, including all following records,
@@ -1358,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
*/
seq = clear_seq;
idx = clear_idx;
- prev = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len += msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
+ len += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1371,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* move first record forward until length fits into the buffer */
seq = clear_seq;
idx = clear_idx;
- prev = 0;
while (len > size && seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len -= msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
+ len -= msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1389,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct printk_log *msg = log_from_idx(idx);
int textlen;
- textlen = msg_print_text(msg, prev, true, text,
+ textlen = msg_print_text(msg, true, text,
LOG_LINE_MAX + PREFIX_MAX);
if (textlen < 0) {
len = textlen;
@@ -1397,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
}
idx = log_next(idx);
seq++;
- prev = msg->flags;
raw_spin_unlock_irq(&logbuf_lock);
if (copy_to_user(buf + len, text, textlen))
@@ -1410,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* messages are gone, move to next one */
seq = log_first_seq;
idx = log_first_idx;
- prev = 0;
}
}
}
@@ -1511,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
- syslog_prev = 0;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1524,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
- enum log_flags prev = syslog_prev;
error = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- error += msg_print_text(msg, prev, true, NULL, 0);
+ error += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
error -= syslog_partial;
}
@@ -1567,7 +1516,7 @@ static void call_console_drivers(int level,
{
struct console *con;
- trace_console(text, len);
+ trace_console_rcuidle(text, len);
if (!console_drivers)
return;
@@ -1634,55 +1583,32 @@ static inline void printk_delay(void)
static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
- size_t cons; /* bytes written to console */
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
- bool flushed:1; /* buffer sealed and committed */
} cont;
-static void cont_flush(enum log_flags flags)
+static void cont_flush(void)
{
- if (cont.flushed)
- return;
if (cont.len == 0)
return;
- if (cont.cons) {
- /*
- * If a fragment of this line was directly flushed to the
- * console; wait for the console to pick up the rest of the
- * line. LOG_NOCONS suppresses a duplicated output.
- */
- log_store(cont.facility, cont.level, flags | LOG_NOCONS,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.flags = flags;
- cont.flushed = true;
- } else {
- /*
- * If no fragment of this line ever reached the console,
- * just submit it to the store and free the buffer.
- */
- log_store(cont.facility, cont.level, flags, 0,
- NULL, 0, cont.buf, cont.len);
- cont.len = 0;
- }
+ log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
}
-static bool cont_add(int facility, int level, const char *text, size_t len)
+static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
{
- if (cont.len && cont.flushed)
- return false;
-
/*
* If ext consoles are present, flush and skip in-kernel
* continuation. See nr_ext_console_drivers definition. Also, if
* the line gets too long, split it up in separate records.
*/
if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
- cont_flush(LOG_CONT);
+ cont_flush();
return false;
}
@@ -1691,46 +1617,52 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
cont.level = level;
cont.owner = current;
cont.ts_nsec = local_clock();
- cont.flags = 0;
- cont.cons = 0;
- cont.flushed = false;
+ cont.flags = flags;
}
memcpy(cont.buf + cont.len, text, len);
cont.len += len;
+ // The original flags come from the first line,
+ // but later continuations can add a newline.
+ if (flags & LOG_NEWLINE) {
+ cont.flags |= LOG_NEWLINE;
+ cont_flush();
+ }
+
if (cont.len > (sizeof(cont.buf) * 80) / 100)
- cont_flush(LOG_CONT);
+ cont_flush();
return true;
}
-static size_t cont_print_text(char *text, size_t size)
+static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
{
- size_t textlen = 0;
- size_t len;
-
- if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
- textlen += print_time(cont.ts_nsec, text);
- size -= textlen;
+ /*
+ * If an earlier line was buffered, and we're a continuation
+ * write from the same process, try to add it to the buffer.
+ */
+ if (cont.len) {
+ if (cont.owner == current && (lflags & LOG_CONT)) {
+ if (cont_add(facility, level, lflags, text, text_len))
+ return text_len;
+ }
+ /* Otherwise, make sure it's flushed */
+ cont_flush();
}
- len = cont.len - cont.cons;
- if (len > 0) {
- if (len+1 > size)
- len = size-1;
- memcpy(text + textlen, cont.buf + cont.cons, len);
- textlen += len;
- cont.cons = cont.len;
- }
+ /* Skip empty continuation lines that couldn't be added - they just flush */
+ if (!text_len && (lflags & LOG_CONT))
+ return 0;
- if (cont.flushed) {
- if (cont.flags & LOG_NEWLINE)
- text[textlen++] = '\n';
- /* got everything, release buffer */
- cont.len = 0;
+ /* If it doesn't end in a newline, try to buffer the current line */
+ if (!(lflags & LOG_NEWLINE)) {
+ if (cont_add(facility, level, lflags, text, text_len))
+ return text_len;
}
- return textlen;
+
+ /* Store it in the record log */
+ return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -1819,10 +1751,9 @@ asmlinkage int vprintk_emit(int facility, int level,
/* strip kernel syslog prefix and extract log level or control flags */
if (facility == 0) {
- int kern_level = printk_get_level(text);
+ int kern_level;
- if (kern_level) {
- const char *end_of_header = printk_skip_level(text);
+ while ((kern_level = printk_get_level(text)) != 0) {
switch (kern_level) {
case '0' ... '7':
if (level == LOGLEVEL_DEFAULT)
@@ -1830,14 +1761,13 @@ asmlinkage int vprintk_emit(int facility, int level,
/* fallthrough */
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
+ break;
+ case 'c': /* KERN_CONT */
+ lflags |= LOG_CONT;
}
- /*
- * No need to check length here because vscnprintf
- * put '\0' at the end of the string. Only valid and
- * newly printed level is detected.
- */
- text_len -= end_of_header - text;
- text = (char *)end_of_header;
+
+ text_len -= 2;
+ text += 2;
}
}
@@ -1847,45 +1777,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- if (!(lflags & LOG_NEWLINE)) {
- /*
- * Flush the conflicting buffer. An earlier newline was missing,
- * or another task also prints continuation lines.
- */
- if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
- cont_flush(LOG_NEWLINE);
-
- /* buffer line if possible, otherwise store it right away */
- if (cont_add(facility, level, text, text_len))
- printed_len += text_len;
- else
- printed_len += log_store(facility, level,
- lflags | LOG_CONT, 0,
- dict, dictlen, text, text_len);
- } else {
- bool stored = false;
-
- /*
- * If an earlier newline was missing and it was the same task,
- * either merge it with the current buffer and flush, or if
- * there was a race with interrupts (prefix == true) then just
- * flush it out and store this line separately.
- * If the preceding printk was from a different task and missed
- * a newline, flush and append the newline.
- */
- if (cont.len) {
- if (cont.owner == current && !(lflags & LOG_PREFIX))
- stored = cont_add(facility, level, text,
- text_len);
- cont_flush(LOG_NEWLINE);
- }
-
- if (stored)
- printed_len += text_len;
- else
- printed_len += log_store(facility, level, lflags, 0,
- dict, dictlen, text, text_len);
- }
+ printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
@@ -1935,7 +1827,8 @@ int vprintk_default(const char *fmt, va_list args)
int r;
#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
+ /* Allow passing printk() to kdb, but avoid recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
return r;
}
@@ -1989,33 +1882,24 @@ static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
static u32 console_idx;
-static enum log_flags syslog_prev;
static u64 log_first_seq;
static u32 log_first_idx;
static u64 log_next_seq;
-static enum log_flags console_prev;
-static struct cont {
- size_t len;
- size_t cons;
- u8 level;
- bool flushed:1;
-} cont;
static char *log_text(const struct printk_log *msg) { return NULL; }
static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq,
- enum log_flags prev_flags) { return 0; }
+ struct printk_log *msg,
+ u64 seq) { return 0; }
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
static void call_console_drivers(int level,
const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
/* Still needs to be defined for users */
@@ -2175,27 +2059,20 @@ void resume_console(void)
/**
* console_cpu_notify - print deferred console messages after CPU hotplug
- * @self: notifier struct
- * @action: CPU hotplug event
- * @hcpu: unused
+ * @cpu: unused
*
* If printk() is called from a CPU that is not online yet, the messages
* will be spooled but will not show up on the console. This function is
* called when a new CPU comes online (or fails to come up), and ensures
* that any such output gets printed.
*/
-static int console_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_DEAD:
- case CPU_DOWN_FAILED:
- case CPU_UP_CANCELED:
+static int console_cpu_notify(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen) {
console_lock();
console_unlock();
}
- return NOTIFY_OK;
+ return 0;
}
/**
@@ -2286,42 +2163,6 @@ static inline int can_use_console(void)
return cpu_online(raw_smp_processor_id()) || have_callable_console();
}
-static void console_cont_flush(char *text, size_t size)
-{
- unsigned long flags;
- size_t len;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
-
- if (!cont.len)
- goto out;
-
- if (suppress_message_printing(cont.level)) {
- cont.cons = cont.len;
- if (cont.flushed)
- cont.len = 0;
- goto out;
- }
-
- /*
- * We still queue earlier records, likely because the console was
- * busy. The earlier ones need to be printed before this one, we
- * did not flush any fragment so far, so just let it queue up.
- */
- if (console_seq < log_next_seq && !cont.cons)
- goto out;
-
- len = cont_print_text(text, size);
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings();
- call_console_drivers(cont.level, NULL, 0, text, len);
- start_critical_timings();
- local_irq_restore(flags);
- return;
-out:
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-}
-
/**
* console_unlock - unlock the console system
*
@@ -2375,9 +2216,6 @@ again:
return;
}
- /* flush buffered message fragment immediately to console */
- console_cont_flush(text, sizeof(text));
-
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
@@ -2397,7 +2235,6 @@ again:
/* messages are gone, move to first one */
console_seq = log_first_seq;
console_idx = log_first_idx;
- console_prev = 0;
} else {
len = 0;
}
@@ -2407,8 +2244,7 @@ skip:
msg = log_from_idx(console_idx);
level = msg->level;
- if ((msg->flags & LOG_NOCONS) ||
- suppress_message_printing(level)) {
+ if (suppress_message_printing(level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
@@ -2416,22 +2252,14 @@ skip:
*/
console_idx = log_next(console_idx);
console_seq++;
- /*
- * We will get here again when we register a new
- * CON_PRINTBUFFER console. Clear the flag so we
- * will properly dump everything later.
- */
- msg->flags &= ~LOG_NOCONS;
- console_prev = msg->flags;
goto skip;
}
- len += msg_print_text(msg, console_prev, false,
- text + len, sizeof(text) - len);
+ len += msg_print_text(msg, false, text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
ext_len = msg_print_ext_header(ext_text,
sizeof(ext_text),
- msg, console_seq, console_prev);
+ msg, console_seq);
ext_len += msg_print_ext_body(ext_text + ext_len,
sizeof(ext_text) - ext_len,
log_dict(msg), msg->dict_len,
@@ -2439,7 +2267,6 @@ skip:
}
console_idx = log_next(console_idx);
console_seq++;
- console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
@@ -2734,7 +2561,6 @@ void register_console(struct console *newcon)
raw_spin_lock_irqsave(&logbuf_lock, flags);
console_seq = syslog_seq;
console_idx = syslog_idx;
- console_prev = syslog_prev;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -2833,6 +2659,7 @@ EXPORT_SYMBOL(unregister_console);
static int __init printk_late_init(void)
{
struct console *con;
+ int ret;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
@@ -2847,7 +2674,12 @@ static int __init printk_late_init(void)
unregister_console(con);
}
}
- hotcpu_notifier(console_cpu_notify, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
+ console_cpu_notify);
+ WARN_ON(ret < 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
+ console_cpu_notify, NULL);
+ WARN_ON(ret < 0);
return 0;
}
late_initcall(printk_late_init);
@@ -3084,7 +2916,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
goto out;
msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, 0, syslog, line, size);
+ l = msg_print_text(msg, syslog, line, size);
dumper->cur_idx = log_next(dumper->cur_idx);
dumper->cur_seq++;
@@ -3153,7 +2985,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
u32 idx;
u64 next_seq;
u32 next_idx;
- enum log_flags prev;
size_t l = 0;
bool ret = false;
@@ -3176,27 +3007,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* calculate length of entire buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- prev = 0;
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, prev, true, NULL, 0);
+ l += msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- prev = 0;
while (l > size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l -= msg_print_text(msg, prev, true, NULL, 0);
+ l -= msg_print_text(msg, true, NULL, 0);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
/* last message in next interation */
@@ -3207,10 +3034,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, prev, syslog, buf + l, size - l);
+ l += msg_print_text(msg, syslog, buf + l, size - l);
idx = log_next(idx);
seq++;
- prev = msg->flags;
}
dumper->next_seq = next_seq;
diff --git a/kernel/profile.c b/kernel/profile.c
index 2dbccf2d806c..f67ce0aa6bc4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -408,7 +408,7 @@ void profile_tick(int type)
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1d3b7665d0be..49ba7c1ade9d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -27,6 +27,35 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
+/*
+ * Access another process' address space via ptrace.  Source/target buffer must
+ * be kernel space.  Do not walk the page table directly, use get_user_pages.
+ * Returns 0 if tsk has no mm or current fails the tracer/dumpability checks.
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ if (!tsk->ptrace ||
+ (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return 0;
+ }
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ mmput(mm);
+
+ return ret;
+}
+
/*
* ptrace a task: make the debugger its new parent and
@@ -39,6 +68,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
BUG_ON(!list_empty(&child->ptrace_entry));
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
+ rcu_read_lock();
+ child->ptracer_cred = get_cred(__task_cred(new_parent));
+ rcu_read_unlock();
}
/**
@@ -71,10 +103,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
child->ptrace = 0;
@@ -218,7 +256,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
- int dumpable = 0;
+ struct mm_struct *mm;
kuid_t caller_uid;
kgid_t caller_gid;
@@ -269,16 +307,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
- smp_rmb();
- if (task->mm)
- dumpable = get_dumpable(task->mm);
- rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
+ mm = task->mm;
+ if (mm &&
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptrace_has_cap(mm->user_ns, mode)))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -342,10 +375,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize)
flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
task->ptrace = flags;
__ptrace_link(task, current);
@@ -489,7 +518,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
/*
@@ -536,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, 0);
+ retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
+
if (!retval) {
if (copied)
break;
@@ -563,7 +592,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ retval = ptrace_access_vm(tsk, dst, buf, this_len,
+ FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
break;
@@ -1126,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1137,7 +1167,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1154,7 +1185,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ ret = ptrace_access_vm(child, addr, &word, sizeof(word),
+ FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
else
@@ -1163,7 +1195,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = ptrace_access_vm(child, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
#define TPS(x) tracepoint_string(x)
void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
/*
* This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d38ab08a3fe7..123ccbd22449 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define PERF_FLAG "-perf:"
#define PERFOUT_STRING(s) \
- pr_alert("%s" PERF_FLAG s "\n", perf_type)
+ pr_alert("%s" PERF_FLAG " %s\n", perf_type, s)
#define VERBOSE_PERFOUT_STRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
#define VERBOSE_PERFOUT_ERRSTRING(s) \
@@ -400,9 +400,8 @@ rcu_perf_writer(void *arg)
sp.sched_priority = 0;
sched_setscheduler_nocheck(current,
SCHED_NORMAL, &sp);
- pr_alert("%s" PERF_FLAG
- "rcu_perf_writer %ld has %d measurements\n",
- perf_type, me, MIN_MEAS);
+ pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
+ perf_type, PERF_FLAG, me, MIN_MEAS);
if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
nrealwriters) {
schedule_timeout_interruptible(10);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 971e2b138063..d81345be730e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
static void rcu_read_delay(struct torture_random_state *rrsp)
{
+ unsigned long started;
+ unsigned long completed;
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;
+ unsigned long long ts;
/* We want a short delay sometimes to make a reader delay the grace
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
+ started = cur_ops->completed();
+ ts = rcu_trace_clock_local();
mdelay(longdelay_ms);
+ completed = cur_ops->completed();
+ do_trace_rcu_torture_read(cur_ops->name, NULL, ts,
+ started, completed);
+ }
if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
@@ -555,10 +564,25 @@ static void srcu_torture_stats(void)
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
long c0, c1;
+ struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
- c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
+ u0 = counts->unlock_count[!idx];
+ u1 = counts->unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->lock_count[!idx];
+ l1 = counts->lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
@@ -1238,6 +1262,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ struct task_struct *wtp;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -1258,8 +1283,9 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_barrier_error,
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
@@ -1312,10 +1338,12 @@ rcu_torture_stats_print(void)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
+ wtp = READ_ONCE(writer_task);
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
- gpnum, completed, flags);
+ gpnum, completed, flags,
+ wtp == NULL ? ~0UL : wtp->state);
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1362,12 +1390,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
onoff_interval, onoff_holdoff);
}
-static void rcutorture_booster_cleanup(int cpu)
+static int rcutorture_booster_cleanup(unsigned int cpu)
{
struct task_struct *t;
if (boost_tasks[cpu] == NULL)
- return;
+ return 0;
mutex_lock(&boost_mutex);
t = boost_tasks[cpu];
boost_tasks[cpu] = NULL;
@@ -1375,9 +1403,10 @@ static void rcutorture_booster_cleanup(int cpu)
/* This must be outside of the mutex, otherwise deadlock! */
torture_stop_kthread(rcu_torture_boost, t);
+ return 0;
}
-static int rcutorture_booster_init(int cpu)
+static int rcutorture_booster_init(unsigned int cpu)
{
int retval;
@@ -1577,28 +1606,7 @@ static void rcu_torture_barrier_cleanup(void)
}
}
-static int rcutorture_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- (void)rcutorture_booster_init(cpu);
- break;
- case CPU_DOWN_PREPARE:
- rcutorture_booster_cleanup(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block rcutorture_cpu_nb = {
- .notifier_call = rcutorture_cpu_notify,
-};
+static enum cpuhp_state rcutor_hp;
static void
rcu_torture_cleanup(void)
@@ -1638,11 +1646,8 @@ rcu_torture_cleanup(void)
for (i = 0; i < ncbflooders; i++)
torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
- unregister_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i)
- rcutorture_booster_cleanup(i);
- }
+ test_boost == 2)
+ cpuhp_remove_state(rcutor_hp);
/*
* Wait for all RCU callbacks to fire, then do flavor-specific
@@ -1869,14 +1874,13 @@ rcu_torture_init(void)
test_boost == 2) {
boost_starttime = jiffies + test_boost_interval * HZ;
- register_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i) {
- if (cpu_is_offline(i))
- continue; /* Heuristic: CPU can go offline. */
- firsterr = rcutorture_booster_init(i);
- if (firsterr)
- goto unwind;
- }
+
+ firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
+ rcutorture_booster_init,
+ rcutorture_booster_cleanup);
+ if (firsterr < 0)
+ goto unwind;
+ rcutor_hp = firsterr;
}
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..e773129c8b08 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
rcu_batch_init(&sp->batch_check1);
rcu_batch_init(&sp->batch_done);
INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_array);
return sp->per_cpu_ref ? 0 : -ENOMEM;
}
@@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * Returns approximate total of the readers' ->seq[] values for the
+ * Returns approximate total of the readers' ->lock_count[] values for the
* rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[idx]);
}
return sum;
}
/*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
+ * Returns approximate total of the readers' ->unlock_count[] values for the
+ * rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
{
int cpu;
unsigned long sum = 0;
- unsigned long t;
for_each_possible_cpu(cpu) {
- t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
- sum += t;
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->unlock_count[idx]);
}
return sum;
}
/*
* Return true if the number of pre-existing readers is determined to
- * be stably zero. An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement. This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
+ * be zero.
*/
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
- unsigned long seq;
+ unsigned long unlocks;
- seq = srcu_readers_seq_idx(sp, idx);
+ unlocks = srcu_readers_unlock_idx(sp, idx);
/*
- * The following smp_mb() A pairs with the smp_mb() B located in
- * __srcu_read_lock(). This pairing ensures that if an
- * __srcu_read_lock() increments its counter after the summation
- * in srcu_readers_active_idx(), then the corresponding SRCU read-side
- * critical section will see any changes made prior to the start
- * of the current SRCU grace period.
+ * Make sure that a lock is always counted if the corresponding unlock
+ * is counted. Needs to be a smp_mb() as the read side may contain a
+ * read from a variable that is written to before the synchronize_srcu()
+ * in the write side. In this case smp_mb()s A and B act like the store
+ * buffering pattern.
*
- * Also, if the above call to srcu_readers_seq_idx() saw the
- * increment of ->seq[], then the call to srcu_readers_active_idx()
- * must see the increment of ->c[].
+ * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
+ * synchronize_srcu() from being executed before the grace period ends.
*/
smp_mb(); /* A */
/*
- * Note that srcu_readers_active_idx() can incorrectly return
- * zero even though there is a pre-existing reader throughout.
- * To see this, suppose that task A is in a very long SRCU
- * read-side critical section that started on CPU 0, and that
- * no other reader exists, so that the sum of the counters
- * is equal to one. Then suppose that task B starts executing
- * srcu_readers_active_idx(), summing up to CPU 1, and then that
- * task C starts reading on CPU 0, so that its increment is not
- * summed, but finishes reading on CPU 2, so that its decrement
- * -is- summed. Then when task B completes its sum, it will
- * incorrectly get zero, despite the fact that task A has been
- * in its SRCU read-side critical section the whole time.
- *
- * We therefore do a validation step should srcu_readers_active_idx()
- * return zero.
- */
- if (srcu_readers_active_idx(sp, idx) != 0)
- return false;
-
- /*
- * The remainder of this function is the validation step.
- * The following smp_mb() D pairs with the smp_mb() C in
- * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
- * by srcu_readers_active_idx() above, then any destructive
- * operation performed after the grace period will happen after
- * the corresponding SRCU read-side critical section.
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some time in between. This does not
+ * mean that there are no more readers, as one could have read the
+ * current index but not have incremented the lock counter yet.
*
- * Note that there can be at most NR_CPUS worth of readers using
- * the old index, which is not enough to overflow even a 32-bit
- * integer. (Yes, this does mean that systems having more than
- * a billion or so CPUs need to be 64-bit systems.) Therefore,
- * the sum of the ->seq[] counters cannot possibly overflow.
- * Therefore, the only way that the return values of the two
- * calls to srcu_readers_seq_idx() can be equal is if there were
- * no increments of the corresponding rank of ->seq[] counts
- * in the interim. But the missed-increment scenario laid out
- * above includes an increment of the ->seq[] counter by
- * the corresponding __srcu_read_lock(). Therefore, if this
- * scenario occurs, the return values from the two calls to
- * srcu_readers_seq_idx() will differ, and thus the validation
- * step below suffices.
+ * Possible bug: There is no guarantee that there haven't been ULONG_MAX
+ * increments of ->lock_count[] since the unlocks were counted, meaning
+ * that this could return true even if there are still active readers.
+ * Since there are no memory barriers around srcu_flip(), the CPU is not
+ * required to increment ->completed before running
+ * srcu_readers_unlock_idx(), which means that there could be an
+ * arbitrarily large number of critical sections that execute after
+ * srcu_readers_unlock_idx() but use the old value of ->completed.
*/
- smp_mb(); /* D */
-
- return srcu_readers_seq_idx(sp, idx) == seq;
+ return srcu_readers_lock_idx(sp, idx) == unlocks;
}
/**
@@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
+
+ sum += READ_ONCE(cpuc->lock_count[0]);
+ sum += READ_ONCE(cpuc->lock_count[1]);
+ sum -= READ_ONCE(cpuc->unlock_count[0]);
+ sum -= READ_ONCE(cpuc->unlock_count[1]);
}
return sum;
}
@@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
idx = READ_ONCE(sp->completed) & 0x1;
- __this_cpu_inc(sp->per_cpu_ref->c[idx]);
+ __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_dec(sp->per_cpu_ref->c[idx]);
+ this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -349,12 +315,21 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
/*
* Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * use the other rank of the ->(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *sp)
{
- sp->completed++;
+ WRITE_ONCE(sp->completed, sp->completed + 1);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's next __srcu_read_lock() will see the
+ * above counter update. Note that both this memory barrier
+ * and the one in srcu_readers_active_idx_check() provide the
+ * guarantee for __srcu_read_lock().
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
}
/*
@@ -392,6 +367,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
head->next = NULL;
head->func = func;
spin_lock_irqsave(&sp->queue_lock, flags);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
@@ -425,6 +401,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
head->next = NULL;
head->func = wakeme_after_rcu;
spin_lock_irq(&sp->queue_lock);
+ smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
if (!sp->running) {
/* steal the processing owner */
sp->running = true;
@@ -444,8 +421,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
spin_unlock_irq(&sp->queue_lock);
}
- if (!done)
+ if (!done) {
wait_for_completion(&rcu.completion);
+ smp_mb(); /* Caller's later accesses after GP. */
+ }
+
}
/**
@@ -613,7 +593,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
/*
* Invoke a limited number of SRCU callbacks that have passed through
* their grace period. If there are more to do, SRCU will reschedule
- * the workqueue.
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
*/
static void srcu_invoke_callbacks(struct srcu_struct *sp)
{
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9f3d37..50d1861f7759 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
"suspicious rcu_sync_is_idle() usage");
}
+
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
#endif
/**
@@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * rcu_sync_enter_start() - Make rcu_sync_is_idle() return false up front
+ * @rsp: rcu_sync structure; call after rcu_sync_init() and before first use
+ *
+ * Also ensures that rcu_sync_{enter,exit}() pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+ rsp->gp_count++;
+ rsp->gp_state = GP_PASSED;
+}
+
+/**
* rcu_sync_enter() - Force readers onto slowpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 944b1b491ed8..fa6a48d3917b 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -41,8 +41,6 @@
/* Forward declarations for tiny_plugin.h. */
struct rcu_ctrlblk;
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
rcu_callback_t func,
struct rcu_ctrlblk *rcp);
@@ -170,7 +168,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
false));
}
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
@@ -185,9 +183,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
* benefits of doing might_sleep() to reduce latency.)
*
* Cool, huh? (Due to Josh Triplett.)
- *
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
*/
void synchronize_sched(void)
{
@@ -195,7 +190,6 @@ void synchronize_sched(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_sched() in RCU read-side critical section");
- cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
* During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5d80925e7fc8..d80e0d2f68c6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -41,7 +41,6 @@
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -60,7 +59,6 @@
#include "tree.h"
#include "rcu.h"
-MODULE_ALIAS("rcutree");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -129,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
int sysctl_panic_on_rcu_stall __read_mostly;
/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
* optimize synchronize_rcu() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
*/
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -280,6 +281,116 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
+/*
+ * Record entry into an extended quiescent state; ->dynticks must end up even.
+ * This is only to be called when not already in an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_enter(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special;
+
+ /*
+ * CPUs seeing atomic_inc_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ special = atomic_inc_return(&rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
+}
+
+/*
+ * Record exit from an extended quiescent state; ->dynticks must end up odd.
+ * This is only to be called from an extended quiescent state.
+ */
+static void rcu_dynticks_eqs_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special;
+
+ /*
+ * CPUs seeing atomic_inc_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ special = atomic_inc_return(&rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
+}
+
+/*
+ * Reset the current CPU's ->dynticks counter to indicate that the
+ * newly onlined CPU is no longer in an extended quiescent state.
+ * This will either leave the counter unchanged, or increment it
+ * to the next non-quiescent (odd) value.
+ *
+ * The non-atomic test/increment sequence works because the upper bits
+ * of the ->dynticks counter are manipulated only by the corresponding CPU,
+ * or when the corresponding CPU is offline.
+ */
+static void rcu_dynticks_eqs_online(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ if (atomic_read(&rdtp->dynticks) & 0x1)
+ return;
+ atomic_add(0x1, &rdtp->dynticks);
+}
+
+/*
+ * Is the current CPU in an extended quiescent state (->dynticks bit 0 clear)?
+ *
+ * No ordering, as we are sampling CPU-local information.
+ */
+bool rcu_dynticks_curr_cpu_in_eqs(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ return !(atomic_read(&rdtp->dynticks) & 0x1);
+}
+
+/*
+ * Snapshot ->dynticks with full ordering (hence atomic_add_return() of zero
+ * rather than atomic_read()) so as to allow stable comparison of this counter
+ * with past and future snapshots.
+ */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
+{
+ int snap = atomic_add_return(0, &rdtp->dynticks);
+
+ return snap;
+}
+
+/*
+ * Return true if the snapshot returned from rcu_dynticks_snap() indicates
+ * that RCU is in an extended quiescent state, that is, if bit 0 is clear.
+ */
+static bool rcu_dynticks_in_eqs(int snap)
+{
+ return !(snap & 0x1);
+}
+
+/*
+ * Return true if the CPU corresponding to the specified rcu_dynticks
+ * structure has spent some time in an extended quiescent state since
+ * rcu_dynticks_snap() returned snap, that is, if the counter has changed.
+ */
+static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
+{
+ return snap != rcu_dynticks_snap(rdtp);
+}
+
+/*
+ * Do a double-increment of the ->dynticks counter to emulate a momentary
+ * idle-CPU quiescent state.  Adding 2 leaves bit 0 of the counter unchanged.
+ */
+static void rcu_dynticks_momentary_idle(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int special = atomic_add_return(2, &rdtp->dynticks);
+
+ /* It is illegal to call this from idle state. */
+ WARN_ON_ONCE(!(special & 0x1));
+}
+
DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
@@ -299,7 +410,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
static void rcu_momentary_dyntick_idle(void)
{
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
@@ -326,10 +436,7 @@ static void rcu_momentary_dyntick_idle(void)
* quiescent state, with no need for this CPU to do anything
* further.
*/
- rdtp = this_cpu_ptr(&rcu_dynticks);
- smp_mb__before_atomic(); /* Earlier stuff before QS. */
- atomic_add(2, &rdtp->dynticks); /* QS. */
- smp_mb__after_atomic(); /* Later stuff after QS. */
+ rcu_dynticks_momentary_idle();
break;
}
}
@@ -610,7 +717,7 @@ static int
cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
{
return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_DONE_TAIL] != NULL;
+ rdp->nxttail[RCU_NEXT_TAIL] != NULL;
}
/*
@@ -672,7 +779,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -691,12 +798,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
rcu_dynticks_task_enter();
/*
@@ -825,15 +927,10 @@ void rcu_irq_exit_irqson(void)
*/
static void rcu_eqs_exit_common(long long oldval, int user)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
rcu_dynticks_task_exit();
- smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(atomic_read(&rdtp->dynticks) & 0x1));
+ rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -979,12 +1076,8 @@ void rcu_nmi_enter(void)
* to be in the outermost NMI handler that interrupted an RCU-idle
* period (observation due to Andy Lutomirski).
*/
- if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* atomic_inc() before later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_dynticks_eqs_exit();
incby = 1;
}
rdtp->dynticks_nmi_nesting += incby;
@@ -1009,7 +1102,7 @@ void rcu_nmi_exit(void)
* to us!)
*/
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
/*
* If the nesting level is not 1, the CPU wasn't RCU-idle, so
@@ -1022,11 +1115,7 @@ void rcu_nmi_exit(void)
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
rdtp->dynticks_nmi_nesting = 0;
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force delay to next write. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_eqs_enter();
}
/**
@@ -1039,7 +1128,7 @@ void rcu_nmi_exit(void)
*/
bool notrace __rcu_is_watching(void)
{
- return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+ return !rcu_dynticks_curr_cpu_in_eqs();
}
/**
@@ -1122,9 +1211,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
static int dyntick_save_progress_counter(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- if ((rdp->dynticks_snap & 0x1) == 0) {
+ if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
@@ -1143,12 +1232,10 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
- unsigned int curr;
+ unsigned long jtsq;
int *rcrmp;
- unsigned int snap;
-
- curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned int)rdp->dynticks_snap;
+ unsigned long rjtsc;
+ struct rcu_node *rnp;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1158,27 +1245,39 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
return 1;
}
+ /* Compute and saturate jiffies_till_sched_qs. */
+ jtsq = jiffies_till_sched_qs;
+ rjtsc = rcu_jiffies_till_stall_check();
+ if (jtsq > rjtsc / 2) {
+ WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
+ jtsq = rjtsc / 2;
+ } else if (jtsq < 1) {
+ WRITE_ONCE(jiffies_till_sched_qs, 1);
+ jtsq = 1;
+ }
+
/*
- * Check for the CPU being offline, but only if the grace period
- * is old enough. We don't need to worry about the CPU changing
- * state: If we see it offline even once, it has been through a
- * quiescent state.
- *
- * The reason for insisting that the grace period be at least
- * one jiffy old is that CPUs that are not quite online and that
- * have just gone offline can still execute RCU read-side critical
- * sections.
+ * Has this CPU encountered a cond_resched_rcu_qs() since the
+ * beginning of the grace period? For this to be the case,
+ * the CPU has to have noticed the current grace period. This
+ * might not be the case for nohz_full CPUs looping in the kernel.
*/
- if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
- return 0; /* Grace period is not old enough. */
- barrier();
- if (cpu_is_offline(rdp->cpu)) {
+ rnp = rdp->mynode;
+ if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
+ READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) &&
+ READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ return 1;
+ }
+
+ /* Check for the CPU being offline. */
+ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
return 1;
@@ -1206,9 +1305,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* warning delay.
*/
rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ if (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
+ time_after(jiffies, rdp->rsp->jiffies_resched)) {
if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
WRITE_ONCE(rdp->cond_resched_completed,
READ_ONCE(rdp->mynode->completed));
@@ -1219,11 +1317,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
- /* And if it has been a really long time, kick the CPU as well. */
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ /*
+ * If more than halfway to RCU CPU stall-warning time, do
+ * a resched_cpu() to try to loosen things up a bit.
+ */
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ resched_cpu(rdp->cpu);
return 0;
}
@@ -1276,7 +1375,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs.
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1286,11 +1388,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
- }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1306,7 +1407,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
if (!rcu_kick_kthreads)
return;
j = READ_ONCE(rsp->jiffies_kick_kthreads);
- if (time_after(jiffies, j) && rsp->gp_kthread) {
+ if (time_after(jiffies, j) && rsp->gp_kthread &&
+ (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) {
WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
rcu_ftrace_dump(DUMP_ALL);
wake_up_process(rsp->gp_kthread);
@@ -1377,6 +1479,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_print_detail_task_stall(rsp);
} else {
if (READ_ONCE(rsp->gpnum) != gpnum ||
READ_ONCE(rsp->completed) == gpnum) {
@@ -1393,9 +1498,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
}
}
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall(rsp);
-
rcu_check_gp_kthread_starvation(rsp);
panic_on_rcu_stall();
@@ -1848,6 +1950,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
bool ret;
+ bool need_gp;
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
@@ -1874,9 +1977,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
*/
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- rdp->cpu_no_qs.b.norm = true;
+ need_gp = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_gp;
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
}
@@ -2344,7 +2448,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -2463,10 +2567,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
- rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
- rdp->gpwrap) {
+ if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum ||
+ rnp->completed == rnp->gpnum || rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2521,8 +2623,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
+ if (rdp->cpu_no_qs.b.norm)
return;
/*
@@ -2828,8 +2929,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* Also schedule RCU core processing.
*
* This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt. If rcu_pending returns
- * false, there is no point in invoking rcu_check_callbacks().
+ * invoked from the scheduling-clock interrupt.
*/
void rcu_check_callbacks(int user)
{
@@ -2970,7 +3070,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
- swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -3013,7 +3113,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/*
* Do RCU core processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_state *rsp;
@@ -3121,7 +3221,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
+ /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
WRITE_ONCE(head->func, rcu_leak_callback);
@@ -3130,13 +3232,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
}
head->func = func;
head->next = NULL;
-
- /*
- * Opportunistically note grace-period endings and beginnings.
- * Note that we might see a beginning right after we see an
- * end, but never vice versa, since this CPU has to pass through
- * a quiescent state betweentimes.
- */
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);
@@ -3482,9 +3577,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_core_needs_qs++;
- } else if (rdp->core_needs_qs &&
- (!rdp->cpu_no_qs.b.norm ||
- rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
+ } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3750,7 +3843,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
- WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -3767,7 +3860,6 @@ static void
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3780,8 +3872,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
- atomic_set(&rdp->dynticks->dynticks,
- (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -3790,10 +3881,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
* of the next grace period.
*/
rnp = rdp->mynode;
- mask = rdp->grpmask;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinitnext |= mask;
- rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
@@ -3860,6 +3948,32 @@ int rcutree_dead_cpu(unsigned int cpu)
return 0;
}
+/*
+ * Mark the specified CPU as being online so that subsequent grace periods
+ * (both expedited and normal) will wait on it. Note that this means that
+ * incoming CPUs are not allowed to use RCU read-side critical sections
+ * until this function is called. Failing to observe this restriction
+ * will result in lockdep splats.
+ */
+void rcu_cpu_starting(unsigned int cpu)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) { /* Repeat for each RCU flavor. */
+ rdp = per_cpu_ptr(rsp->rda, cpu); /* That flavor's data for @cpu. */
+ rnp = rdp->mynode; /* rcu_node whose masks cover @cpu. */
+ mask = rdp->grpmask; /* @cpu's bit within rnp's masks. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->qsmaskinitnext |= mask; /* Both mask updates are made... */
+ rnp->expmaskinitnext |= mask; /* ...while holding rnp's lock. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
@@ -3961,18 +4075,22 @@ static int __init rcu_spawn_gp_kthread(void)
early_initcall(rcu_spawn_gp_kthread);
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
{
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_INIT;
+ rcu_test_sync_prims();
}
/*
@@ -4209,8 +4327,10 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
+ rcu_cpu_starting(cpu);
+ }
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f714f873bf9d..b60f2b6caa14 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -400,9 +400,11 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ atomic_long_t exp_workdone0; /* # done by workqueue. */
atomic_long_t exp_workdone1; /* # done by others #1. */
atomic_long_t exp_workdone2; /* # done by others #2. */
atomic_long_t exp_workdone3; /* # done by others #3. */
+ int exp_dynticks_snap; /* Double-check need for IPI. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -519,7 +521,6 @@ struct rcu_state {
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
@@ -593,6 +594,8 @@ extern struct rcu_state rcu_bh_state;
extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
+int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
@@ -686,18 +689,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #ifdef CONFIG_RCU_TRACE */
/*
- * Place this after a lock-acquisition primitive to guarantee that
- * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
- * if the UNLOCK and LOCK are executed by the same CPU or if the
- * UNLOCK and LOCK operate on the same lock variable.
- */
-#ifdef CONFIG_PPC
-#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
-#else /* #ifdef CONFIG_PPC */
-#define smp_mb__after_unlock_lock() do { } while (0)
-#endif /* #else #ifdef CONFIG_PPC */
-
-/*
* Wrappers for the rcu_node::lock acquire and release.
*
* Because the rcu_nodes form a tree, the tree traversal locking will observe
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6d86ab6ec2c9..a7b639ccd46e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,16 +20,26 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-/* Wrapper functions for expedited grace periods. */
+/*
+ * Record the start of an expedited grace period.
+ */
static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
{
rcu_seq_start(&rsp->expedited_sequence);
}
+
+/*
+ * Record the end of an expedited grace period.
+ */
static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
{
rcu_seq_end(&rsp->expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
+
+/*
+ * Take a snapshot of the expedited-grace-period counter.
+ */
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
unsigned long s;
@@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
return s;
}
+
+/*
+ * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true
+ * if a full expedited grace period has elapsed since that snapshot
+ * was taken.
+ */
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
{
return rcu_seq_done(&rsp->expedited_sequence, s);
@@ -356,10 +372,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
mask_ofl_test = 0;
for_each_leaf_node_possible_cpu(rnp, cpu) {
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ rdp->exp_dynticks_snap =
+ rcu_dynticks_snap(rdp->dynticks);
if (raw_smp_processor_id() == cpu ||
- !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
+ !(rnp->qsmaskinitnext & rdp->grpmask))
mask_ofl_test |= rdp->grpmask;
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
@@ -376,25 +394,31 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
/* IPI the remaining CPUs for expedited quiescent state. */
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
if (!(mask_ofl_ipi & mask))
continue;
retry_ipi:
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
+ mask_ofl_test |= mask;
+ continue;
+ }
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
continue;
}
- /* Failed, raced with offline. */
+ /* Failed, raced with CPU hotplug operation. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (cpu_online(cpu) &&
+ if ((rnp->qsmaskinitnext & mask) &&
(rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ goto retry_ipi;
}
+ /* CPU really is offline, so we can ignore it. */
if (!(rnp->expmask & mask))
mask_ofl_ipi &= ~mask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -427,12 +451,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_stall);
if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
return;
- if (ret < 0) {
- /* Hit a signal, disable CPU stall warnings. */
- swait_event(rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root));
- return;
- }
+ WARN_ON(ret < 0); /* workqueues should not be signaled. */
+ if (rcu_cpu_stall_suppress)
+ continue;
+ panic_on_rcu_stall();
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rsp->name);
ndetected = 0;
@@ -500,7 +522,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
* next GP, to proceed.
*/
mutex_lock(&rsp->exp_wake_mutex);
- mutex_unlock(&rsp->exp_mutex);
rcu_for_each_node_breadth_first(rsp, rnp) {
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -516,6 +537,86 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
mutex_unlock(&rsp->exp_wake_mutex);
}
+/* Let the workqueue handler know what it is supposed to do. */
+struct rcu_exp_work {
+ smp_call_func_t rew_func; /* Handler given to rcu_exp_sel_wait_wake(). */
+ struct rcu_state *rew_rsp; /* RCU flavor whose GP is being expedited. */
+ unsigned long rew_s; /* Snapshot from rcu_exp_gp_seq_snap(). */
+ struct work_struct rew_work; /* Work item; container_of() anchor. */
+};
+
+/*
+ * Common code to drive an expedited grace period forward, used by the
+ * workqueue handler and by direct callers during mid-boot (@s: GP snapshot).
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+ smp_call_func_t func, unsigned long s)
+{
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, func); /* @func is forwarded unchanged. */
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp; /* Request that embeds @wp. */
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work); /* @wp is ->rew_work. */
+ rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
+}
+
+/*
+ * Given an rcu_state pointer and a smp_call_function() handler, kick
+ * off the specified flavor of expedited grace period.
+ */
+static void _synchronize_rcu_expedited(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ struct rcu_data *rdp;
+ struct rcu_exp_work rew; /* On-stack request for the workqueue path. */
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(rsp->call); /* Uses this flavor's ->call function. */
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
+ if (exp_funnel_lock(rsp, s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(rsp, func, s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_func = func;
+ rew.rew_rsp = rsp;
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); /* NOTE(review): no destroy_work_on_stack() visible here - confirm. */
+ schedule_work(&rew.rew_work); /* Handler is wait_rcu_exp_gp(). */
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ rnp = rcu_get_root(rsp);
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3], /* Queue index: bits 1-2 of s. */
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone0, s));
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rsp->exp_mutex); /* Not taken here; presumably by exp_funnel_lock() - confirm. */
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -534,29 +635,18 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
*/
void synchronize_sched_expedited(void)
{
- unsigned long s;
struct rcu_state *rsp = &rcu_sched_state;
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched_expedited() in RCU read-side critical section");
+
/* If only one CPU, this is automatically a grace period. */
if (rcu_blocking_is_gp())
return;
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_sched);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-
- /* Wait and clean up, including waking everyone. */
- rcu_exp_wait_wake(rsp, s);
+ _synchronize_rcu_expedited(rsp, sync_sched_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
@@ -620,23 +710,15 @@ static void sync_rcu_exp_handler(void *info)
void synchronize_rcu_expedited(void)
{
struct rcu_state *rsp = rcu_state_p;
- unsigned long s;
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
- s = rcu_exp_gp_seq_snap(rsp);
- if (exp_funnel_lock(rsp, s))
- return; /* Someone else did our work for us. */
-
- /* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
-
- /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
- rcu_exp_wait_wake(rsp, s);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+ _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -653,3 +735,15 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+ rcu_test_sync_prims(); /* Called once before the mode change... */
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims(); /* ...and once more after it. */
+ return 0; /* Unconditionally reports success. */
+}
+core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0082fce402a0..a240f3308be6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (!rcu_scheduler_active)
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
@@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
ticks_value, ticks_title,
- atomic_read(&rdtp->dynticks) & 0xfff,
+ rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
@@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
+ cond_resched_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
@@ -2365,8 +2366,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
}
/*
- * Each pass through this loop sets up one rcu_data structure and
- * spawns one rcu_nocb_kthread().
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
*/
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 86782f9a4604..8751a748499a 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
- atomic_read(&rdp->dynticks->dynticks),
+ rcu_dynticks_snap(rdp->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
@@ -185,17 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v)
int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
struct rcu_data *rdp;
- unsigned long s1 = 0, s2 = 0, s3 = 0;
+ unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rsp->rda, cpu);
+ s0 += atomic_long_read(&rdp->exp_workdone0);
s1 += atomic_long_read(&rdp->exp_workdone1);
s2 += atomic_long_read(&rdp->exp_workdone2);
s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s1, s2, s3,
- atomic_long_read(&rsp->expedited_normal),
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence, s0, s1, s2, s3,
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
return 0;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f0d8322bc3ec..9e03db9ea9c0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -46,7 +46,7 @@
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
@@ -54,7 +54,6 @@
#include "rcu.h"
-MODULE_ALIAS("rcupdate");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -122,27 +121,30 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* Should expedited grace-period primitives always fall back to their
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins. (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
*/
bool rcu_gp_is_normal(void)
{
- return READ_ONCE(rcu_normal);
+ return READ_ONCE(rcu_normal) &&
+ rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting =
- ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+ rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -179,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
*/
void rcu_end_inkernel_boot(void)
{
- if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
}
@@ -258,7 +259,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -592,7 +593,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
@@ -814,6 +815,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -866,6 +884,7 @@ void rcu_early_boot_tests(void)
early_boot_test_call_rcu_bh();
if (rcu_self_test_sched)
early_boot_test_call_rcu_sched();
+ rcu_test_sync_prims();
}
static int rcu_verify_early_boot_tests(void)
diff --git a/kernel/relay.c b/kernel/relay.c
index d797502140b9..8f18d314a96a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
__free_page(buf->page_array[i]);
relay_free_page_array(buf->page_array);
}
- chan->buf[buf->cpu] = NULL;
+ *per_cpu_ptr(chan->buf, buf->cpu) = NULL;
kfree(buf->padding);
kfree(buf);
kref_put(&chan->kref, relay_destroy_channel);
@@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = {
/**
* wakeup_readers - wake up readers waiting on a channel
- * @data: contains the channel buffer
+ * @work: contains the channel buffer
*
- * This is the timer function used to defer reader waking.
+ * This is the function used to defer reader waking.
*/
-static void wakeup_readers(unsigned long data)
+static void wakeup_readers(struct irq_work *work)
{
- struct rchan_buf *buf = (struct rchan_buf *)data;
+ struct rchan_buf *buf;
+
+ buf = container_of(work, struct rchan_buf, wakeup_work);
wake_up_interruptible(&buf->read_wait);
}
@@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
- } else
- del_timer_sync(&buf->timer);
+ init_irq_work(&buf->wakeup_work, wakeup_readers);
+ } else {
+ irq_work_sync(&buf->wakeup_work);
+ }
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
@@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
*/
void relay_reset(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- __relay_reset(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ __relay_reset(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- __relay_reset(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ __relay_reset(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_reset);
@@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
struct dentry *dentry;
if (chan->is_global)
- return chan->buf[0];
+ return *per_cpu_ptr(chan->buf, 0);
buf = relay_create_buf(chan);
if (!buf)
@@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
__relay_reset(buf, 1);
if(chan->is_global) {
- chan->buf[0] = buf;
+ *per_cpu_ptr(chan->buf, 0) = buf;
buf->cpu = 0;
}
@@ -486,7 +490,7 @@ free_buf:
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- del_timer_sync(&buf->timer);
+ irq_work_sync(&buf->wakeup_work);
buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan,
chan->cb = cb;
}
-/**
- * relay_hotcpu_callback - CPU hotplug callback
- * @nb: notifier block
- * @action: hotplug action to take
- * @hcpu: CPU number
- *
- * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
- */
-static int relay_hotcpu_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
+int relay_prepare_cpu(unsigned int cpu)
{
- unsigned int hotcpu = (unsigned long)hcpu;
struct rchan *chan;
+ struct rchan_buf *buf;
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&relay_channels_mutex);
- list_for_each_entry(chan, &relay_channels, list) {
- if (chan->buf[hotcpu])
- continue;
- chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
- if(!chan->buf[hotcpu]) {
- printk(KERN_ERR
- "relay_hotcpu_callback: cpu %d buffer "
- "creation failed\n", hotcpu);
- mutex_unlock(&relay_channels_mutex);
- return notifier_from_errno(-ENOMEM);
- }
+ mutex_lock(&relay_channels_mutex);
+ list_for_each_entry(chan, &relay_channels, list) {
+ if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+ continue;
+ buf = relay_open_buf(chan, cpu);
+ if (!buf) {
+ pr_err("relay: cpu %d buffer creation failed\n", cpu);
+ mutex_unlock(&relay_channels_mutex);
+ return -ENOMEM;
}
- mutex_unlock(&relay_channels_mutex);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- /* No need to flush the cpu : will be flushed upon
- * final relay_flush() call. */
- break;
+ *per_cpu_ptr(chan->buf, cpu) = buf;
}
- return NOTIFY_OK;
+ mutex_unlock(&relay_channels_mutex);
+ return 0;
}
/**
@@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename,
{
unsigned int i;
struct rchan *chan;
+ struct rchan_buf *buf;
if (!(subbuf_size && n_subbufs))
return NULL;
@@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename,
if (!chan)
return NULL;
+ chan->buf = alloc_percpu(struct rchan_buf *);
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
@@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename,
mutex_lock(&relay_channels_mutex);
for_each_online_cpu(i) {
- chan->buf[i] = relay_open_buf(chan, i);
- if (!chan->buf[i])
+ buf = relay_open_buf(chan, i);
+ if (!buf)
goto free_bufs;
+ *per_cpu_ptr(chan->buf, i) = buf;
}
list_add(&chan->list, &relay_channels);
mutex_unlock(&relay_channels_mutex);
@@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename,
free_bufs:
for_each_possible_cpu(i) {
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
}
kref_put(&chan->kref, relay_destroy_channel);
@@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan,
unsigned int i, curr_cpu;
unsigned long flags;
struct dentry *dentry;
+ struct rchan_buf *buf;
struct rchan_percpu_buf_dispatcher disp;
if (!chan || !base_filename)
@@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan,
if (chan->is_global) {
err = -EINVAL;
- if (!WARN_ON_ONCE(!chan->buf[0])) {
- dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+ buf = *per_cpu_ptr(chan->buf, 0);
+ if (!WARN_ON_ONCE(!buf)) {
+ dentry = relay_create_buf_file(chan, buf, 0);
if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
- relay_set_buf_dentry(chan->buf[0], dentry);
+ relay_set_buf_dentry(buf, dentry);
err = 0;
}
}
@@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan,
* on all currently online CPUs.
*/
for_each_online_cpu(i) {
- if (unlikely(!chan->buf[i])) {
+ buf = *per_cpu_ptr(chan->buf, i);
+ if (unlikely(!buf)) {
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
- dentry = relay_create_buf_file(chan, chan->buf[i], i);
+ dentry = relay_create_buf_file(chan, buf, i);
if (unlikely(!dentry)) {
err = -EINVAL;
break;
@@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan,
if (curr_cpu == i) {
local_irq_save(flags);
- relay_set_buf_dentry(chan->buf[i], dentry);
+ relay_set_buf_dentry(buf, dentry);
local_irq_restore(flags);
} else {
- disp.buf = chan->buf[i];
+ disp.buf = buf;
disp.dentry = dentry;
smp_mb();
/* relay_channels_mutex must be held, so wait. */
@@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait))
+ if (waitqueue_active(&buf->read_wait)) {
/*
* Calling wake_up_interruptible() from here
* will deadlock if we happen to be logging
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
- mod_timer(&buf->timer, jiffies + 1);
+ irq_work_queue(&buf->wakeup_work);
+ }
}
old = buf->data;
@@ -819,14 +809,13 @@ void relay_subbufs_consumed(struct rchan *chan,
{
struct rchan_buf *buf;
- if (!chan)
+ if (!chan || cpu >= NR_CPUS)
return;
- if (cpu >= NR_CPUS || !chan->buf[cpu] ||
- subbufs_consumed > chan->n_subbufs)
+ buf = *per_cpu_ptr(chan->buf, cpu);
+ if (!buf || subbufs_consumed > chan->n_subbufs)
return;
- buf = chan->buf[cpu];
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
@@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
*/
void relay_close(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
mutex_lock(&relay_channels_mutex);
- if (chan->is_global && chan->buf[0])
- relay_close_buf(chan->buf[0]);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
+ relay_close_buf(buf);
else
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
@@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close);
*/
void relay_flush(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- relay_switch_subbuf(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ relay_switch_subbuf(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_switch_subbuf(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_switch_subbuf(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_flush);
@@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
return end_pos;
}
-/*
- * subbuf_read_actor - read up to one subbuf's worth of data
- */
-static int subbuf_read_actor(size_t read_start,
- struct rchan_buf *buf,
- size_t avail,
- read_descriptor_t *desc)
-{
- void *from;
- int ret = 0;
-
- from = buf->start + read_start;
- ret = avail;
- if (copy_to_user(desc->arg.buf, from, avail)) {
- desc->error = -EFAULT;
- ret = 0;
- }
- desc->arg.data += ret;
- desc->written += ret;
- desc->count -= ret;
-
- return ret;
-}
-
-typedef int (*subbuf_actor_t) (size_t read_start,
- struct rchan_buf *buf,
- size_t avail,
- read_descriptor_t *desc);
-
-/*
- * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
- */
-static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
- subbuf_actor_t subbuf_actor,
- read_descriptor_t *desc)
+static ssize_t relay_file_read(struct file *filp,
+ char __user *buffer,
+ size_t count,
+ loff_t *ppos)
{
struct rchan_buf *buf = filp->private_data;
size_t read_start, avail;
+ size_t written = 0;
int ret;
- if (!desc->count)
+ if (!count)
return 0;
inode_lock(file_inode(filp));
do {
+ void *from;
+
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!avail)
break;
- avail = min(desc->count, avail);
- ret = subbuf_actor(read_start, buf, avail, desc);
- if (desc->error < 0)
+ avail = min(count, avail);
+ from = buf->start + read_start;
+ ret = avail;
+ if (copy_to_user(buffer, from, avail))
break;
- if (ret) {
- relay_file_read_consume(buf, read_start, ret);
- *ppos = relay_file_read_end_pos(buf, read_start, ret);
- }
- } while (desc->count && ret);
- inode_unlock(file_inode(filp));
+ buffer += ret;
+ written += ret;
+ count -= ret;
- return desc->written;
-}
+ relay_file_read_consume(buf, read_start, ret);
+ *ppos = relay_file_read_end_pos(buf, read_start, ret);
+ } while (count);
+ inode_unlock(file_inode(filp));
-static ssize_t relay_file_read(struct file *filp,
- char __user *buffer,
- size_t count,
- loff_t *ppos)
-{
- read_descriptor_t desc;
- desc.written = 0;
- desc.count = count;
- desc.arg.buf = buffer;
- desc.error = 0;
- return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
+ return written;
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
@@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = {
.splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
-
-static __init int relay_init(void)
-{
-
- hotcpu_notifier(relay_hotcpu_callback, 0);
- return 0;
-}
-
-early_initcall(relay_init);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..89ab6758667b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,8 +18,8 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c
index a5d966cb8891..da39489d2d80 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/autogroup.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
+ *
+ * However, there is no way sched_autogroup_exit_task() could tell us
+ * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
*/
if (p->flags & PF_EXITING)
return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+ /*
+ * We are going to call exit_notify() and autogroup_move_group() can't
+ * see this thread after that: we can no longer use signal->autogroup.
+ * See the PF_EXITING check in task_wants_autogroup().
+ */
+ sched_move_task(p);
+}
+
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!READ_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ *
+ * If an exiting thread was already removed from thread list we rely on
+ * sched_autogroup_exit_task().
+ */
for_each_thread(p, t)
sched_move_task(t);
-out:
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
@@ -192,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
+ unsigned long shares;
int err;
if (nice < MIN_NICE || nice > MAX_NICE)
@@ -210,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
+ shares = scale_load(sched_prio_to_weight[nice + 20]);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, shares);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h
index 890c95f2587a..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/autogroup.h
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e85a725e5c34..ad64efe41722 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock);
__read_mostly int sched_clock_running;
+void sched_clock_init(void)
+{
+ sched_clock_running = 1;
+}
+
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
-static int __sched_clock_stable_early;
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
+static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
+static int __sched_clock_stable_early = 1;
-int sched_clock_stable(void)
+/*
+ * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ */
+static __read_mostly u64 raw_offset;
+static __read_mostly u64 gtod_offset;
+
+struct sched_clock_data {
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
{
- return static_key_false(&__sched_clock_stable);
+ return this_cpu_ptr(&sched_clock_data);
}
-static void __set_sched_clock_stable(void)
+static inline struct sched_clock_data *cpu_sdc(int cpu)
{
- if (!sched_clock_stable())
- static_key_slow_inc(&__sched_clock_stable);
+ return &per_cpu(sched_clock_data, cpu);
+}
- tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
+int sched_clock_stable(void)
+{
+ return static_branch_likely(&__sched_clock_stable);
}
-void set_sched_clock_stable(void)
+static void __set_sched_clock_stable(void)
{
- __sched_clock_stable_early = 1;
+ struct sched_clock_data *scd = this_scd();
- smp_mb(); /* matches sched_clock_init() */
+ /*
+ * Attempt to make the (initial) unstable->stable transition continuous.
+ */
+ raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
- if (!sched_clock_running)
- return;
+ printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
+ scd->tick_gtod, gtod_offset,
+ scd->tick_raw, raw_offset);
- __set_sched_clock_stable();
+ static_branch_enable(&__sched_clock_stable);
+ tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static void __clear_sched_clock_stable(struct work_struct *work)
{
- /* XXX worry about clock continuity */
- if (sched_clock_stable())
- static_key_slow_dec(&__sched_clock_stable);
+ struct sched_clock_data *scd = this_scd();
+
+ /*
+ * Attempt to make the stable->unstable transition continuous.
+ *
+ * Trouble is, this is typically called from the TSC watchdog
+ * timer, which is late by definition. This means the tick
+ * values can already be screwy.
+ *
+ * Still do what we can.
+ */
+ gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, gtod_offset,
+ scd->tick_raw, raw_offset);
+ static_branch_disable(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
@@ -121,47 +168,15 @@ void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
- smp_mb(); /* matches sched_clock_init() */
-
- if (!sched_clock_running)
- return;
+ smp_mb(); /* matches sched_clock_init_late() */
- schedule_work(&sched_clock_work);
+ if (sched_clock_running == 2)
+ schedule_work(&sched_clock_work);
}
-struct sched_clock_data {
- u64 tick_raw;
- u64 tick_gtod;
- u64 clock;
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-
-static inline struct sched_clock_data *this_scd(void)
+void sched_clock_init_late(void)
{
- return this_cpu_ptr(&sched_clock_data);
-}
-
-static inline struct sched_clock_data *cpu_sdc(int cpu)
-{
- return &per_cpu(sched_clock_data, cpu);
-}
-
-void sched_clock_init(void)
-{
- u64 ktime_now = ktime_to_ns(ktime_get());
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct sched_clock_data *scd = cpu_sdc(cpu);
-
- scd->tick_raw = 0;
- scd->tick_gtod = ktime_now;
- scd->clock = ktime_now;
- }
-
- sched_clock_running = 1;
-
+ sched_clock_running = 2;
/*
* Ensure that it is impossible to not do a static_key update.
*
@@ -173,8 +188,6 @@ void sched_clock_init(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
- else
- __clear_sched_clock_stable(NULL);
}
/*
@@ -216,7 +229,7 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
- clock = scd->tick_gtod + delta;
+ clock = scd->tick_gtod + gtod_offset + delta;
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
@@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;
if (sched_clock_stable())
- return sched_clock();
+ return sched_clock() + raw_offset;
if (unlikely(!sched_clock_running))
return 0ull;
@@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu);
void sched_clock_tick(void)
{
struct sched_clock_data *scd;
- u64 now, now_gtod;
-
- if (sched_clock_stable())
- return;
-
- if (unlikely(!sched_clock_running))
- return;
WARN_ON_ONCE(!irqs_disabled());
+ /*
+ * Update these values even if sched_clock_stable(), because it can
+ * become unstable at any point in time at which point we need some
+ * values to fall back on.
+ *
+ * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ */
scd = this_scd();
- now_gtod = ktime_to_ns(ktime_get());
- now = sched_clock();
+ scd->tick_raw = sched_clock();
+ scd->tick_gtod = ktime_get_ns();
- scd->tick_raw = now;
- scd->tick_gtod = now_gtod;
- sched_clock_local(scd);
+ if (!sched_clock_stable() && likely(sched_clock_running))
+ sched_clock_local(scd);
}
/*
@@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-void sched_clock_init(void)
-{
- sched_clock_running = 1;
-}
-
u64 sched_clock_cpu(int cpu)
{
if (unlikely(!sched_clock_running))
@@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
+
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 8d0f35debf35..f063a25d4449 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -31,7 +31,8 @@ void complete(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
+ if (x->done != UINT_MAX)
+ x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
@@ -51,7 +52,7 @@ void complete_all(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
+ x->done = UINT_MAX;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
@@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x,
if (!x->done)
return timeout;
}
- x->done--;
+ if (x->done != UINT_MAX)
+ x->done--;
return timeout ?: 1;
}
@@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
- else
+ else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c640e99..e1ae6ac15eac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,85 +1,28 @@
/*
* kernel/sched/core.c
*
- * Kernel scheduler and related syscalls
+ * Core kernel scheduler code and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
- *
- * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
- * make semaphores SMP safe
- * 1998-11-19 Implemented schedule_timeout() and related stuff
- * by Andrea Arcangeli
- * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- * hybrid priority-list and round-robin design with
- * an array-switch method of distributing timeslices
- * and per-CPU runqueues. Cleanups and useful suggestions
- * by Davide Libenzi, preemptible kernel bits by Robert Love.
- * 2003-09-03 Interactivity tuning by Con Kolivas.
- * 2004-04-02 Scheduler domains code by Nick Piggin
- * 2007-04-15 Work begun on replacing all interactivity tuning with a
- * fair scheduling design by Con Kolivas.
- * 2007-05-05 Load balancing (smp-nice) and other improvements
- * by Peter Williams
- * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
- * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
- * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
- * Thomas Gleixner, Mike Kravetz
*/
-
-#include <linux/kasan.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <linux/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/pid_namespace.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
#include <linux/cpuset.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/sysctl.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/unistd.h>
-#include <linux/pagemap.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
-#include <linux/ctype.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
-#include <linux/compiler.h>
-#include <linux/frame.h>
+
+#include <linux/blkdev.h>
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
#include <linux/prefetch.h>
+#include <linux/profile.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include <asm/irq_regs.h>
-#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -91,27 +34,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-void update_rq_clock(struct rq *rq)
-{
- s64 delta;
-
- lockdep_assert_held(&rq->lock);
-
- if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
-
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- if (delta < 0)
- return;
- rq->clock += delta;
- update_rq_clock_task(rq, delta);
-}
-
/*
* Debugging: various feature bits
*/
@@ -140,7 +64,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we measure -rt task CPU usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
@@ -153,7 +77,7 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* cpus with isolated domains */
+/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
@@ -185,7 +109,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
@@ -221,11 +145,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* If we observe the old cpu in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new cpu in task_rq_lock, the acquire will
+ * If we observe the new CPU in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
@@ -236,6 +160,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
}
}
+/*
+ * RQ-clock updating methods:
+ */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight misattribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
+}
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
+ return;
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags |= RQCF_UPDATED;
+#endif
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -458,7 +460,7 @@ void wake_up_q(struct wake_q_head *head)
task = container_of(node, struct task_struct, wake_q);
BUG_ON(!task);
- /* task can safely be re-inserted now */
+ /* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -516,12 +518,12 @@ void resched_cpu(int cpu)
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
@@ -550,6 +552,7 @@ unlock:
rcu_read_unlock();
return cpu;
}
+
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
@@ -581,6 +584,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
* If needed we can still optimize that later with an
* empty IRQ.
*/
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
@@ -591,6 +596,11 @@ static bool wake_up_full_nohz_cpu(int cpu)
return false;
}
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
void wake_up_nohz_cpu(int cpu)
{
if (!wake_up_full_nohz_cpu(cpu))
@@ -777,60 +787,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- s64 steal = 0, irq_delta = 0;
-#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
-
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
- rq->prev_steal_time_rq += steal;
- delta -= steal;
- }
-#endif
-
- rq->clock_task += delta;
-
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
-#endif
-}
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1011,7 +967,7 @@ struct migration_arg {
};
/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
* this because either it can't run here any more (set_cpus_allowed()
* away from this CPU, or CPU going down), or because we're
* attempting to rebalance this task on exec (sched_exec).
@@ -1045,8 +1001,8 @@ static int migration_cpu_stop(void *data)
struct rq *rq = this_rq();
/*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
*/
local_irq_disable();
/*
@@ -1063,8 +1019,12 @@ static int migration_cpu_stop(void *data)
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
- if (task_rq(p) == rq && task_on_rq_queued(p))
- rq = __migrate_task(rq, p, arg->dest_cpu);
+ if (task_rq(p) == rq) {
+ if (task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ else
+ p->wake_cpu = arg->dest_cpu;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
@@ -1105,10 +1065,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (running)
+ set_curr_task(rq, p);
}
/*
@@ -1160,7 +1120,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (p->flags & PF_KTHREAD) {
/*
* For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-cpu threads.
+ * !active we want to ensure they are strict per-CPU threads.
*/
WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
!cpumask_intersects(new_mask, cpu_active_mask) &&
@@ -1184,9 +1144,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
rq = move_queued_task(rq, p, dest_cpu);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1265,7 +1225,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
- * previous cpu our targer instead of where it really is.
+ * previous CPU our target instead of where it really is.
*/
p->wake_cpu = cpu;
}
@@ -1445,7 +1405,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
@@ -1497,12 +1457,12 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
* see __set_cpus_allowed_ptr(). At this point the newly online
- * cpu isn't yet part of the sched domains, and balancing will not
+ * CPU isn't yet part of the sched domains, and balancing will not
* see it.
*
- * - on cpu-down we clear cpu_active() to mask the sched domains and
+ * - on CPU-down we clear cpu_active() to mask the sched domains and
* avoid the load balancer to place new tasks on the to be removed
- * cpu. Existing tasks will remain running there and will be taken
+ * CPU. Existing tasks will remain running there and will be taken
* off.
*
* This means that fallback selection must not select !active CPUs.
@@ -1518,9 +1478,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
int dest_cpu;
/*
- * If the node that the cpu is on has been offlined, cpu_to_node()
- * will return -1. There is no cpu on the node, and we should
- * select the cpu on the other node.
+ * If the node that the CPU is on has been offlined, cpu_to_node()
+ * will return -1. There is no CPU on the node, and we should
+ * select the CPU on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
@@ -1552,7 +1512,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
state = possible;
break;
}
- /* fall-through */
+ /* Fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -1596,7 +1556,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_allowed
- * cpu.
+ * CPU.
*
* Since this is common to all placement strategies, this lives here.
*
@@ -1629,23 +1589,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
-#ifdef CONFIG_SCHEDSTATS
- struct rq *rq = this_rq();
+ struct rq *rq;
-#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
+ if (!schedstat_enabled())
+ return;
+
+ rq = this_rq();
- if (cpu == this_cpu) {
- schedstat_inc(rq, ttwu_local);
- schedstat_inc(p, se.statistics.nr_wakeups_local);
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu) {
+ schedstat_inc(rq->ttwu_local);
+ schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
- for_each_domain(this_cpu, sd) {
+ for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
+ schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1653,17 +1615,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+ schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(p, se.statistics.nr_wakeups);
+ schedstat_inc(rq->ttwu_count);
+ schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+ schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -1671,7 +1630,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
- /* if a worker is waking up, notify workqueue */
+ /* If a worker is waking up, notify the workqueue: */
if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}
@@ -1680,7 +1639,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
* Mark the task runnable and perform wakeup-preemption.
*/
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -1692,9 +1651,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
@@ -1713,7 +1672,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP;
@@ -1728,7 +1687,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
ttwu_activate(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, rf);
}
/*
@@ -1747,7 +1706,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, &rf);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -1760,15 +1719,15 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct pin_cookie cookie;
struct task_struct *p;
unsigned long flags;
+ struct rq_flags rf;
if (!llist)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
while (llist) {
int wake_flags = 0;
@@ -1779,10 +1738,10 @@ void sched_ttwu_pending(void)
if (p->sched_remote_wakeup)
wake_flags = WF_MIGRATED;
- ttwu_do_activate(rq, p, wake_flags, cookie);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
}
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1854,7 +1813,7 @@ void wake_up_if_idle(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
+ /* Else CPU is not idle, do nothing here: */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1871,20 +1830,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
- struct pin_cookie cookie;
+ struct rq_flags rf;
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, wake_flags, cookie);
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_pin_lock(rq, &rf);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock(&rq->lock);
}
@@ -1894,8 +1853,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* MIGRATION
*
* The basic program-order guarantee on SMP systems is that when a task [t]
- * migrates, all its activity on its old cpu [c0] happens-before any subsequent
- * execution on its new cpu [c1].
+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
+ * execution on its new CPU [c1].
*
* For migration (of runnable tasks) this is provided by the following means:
*
@@ -1906,7 +1865,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
*
* Transitivity guarantees that B happens after A and C after B.
* Note: we only require RCpc transitivity.
- * Note: the cpu doing B need not be c0 or c1
+ * Note: the CPU doing B need not be c0 or c1
*
* Example:
*
@@ -1985,14 +1944,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * Atomic against schedule() which would dequeue a task, also see
+ * set_current_state().
*
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ * %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
@@ -2013,7 +1973,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
trace_sched_waking(p);
- success = 1; /* we're going to change ->state */
+ /* We're going to change ->state: */
+ success = 1;
cpu = task_cpu(p);
/*
@@ -2062,7 +2023,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_rmb();
/*
- * If the owning (remote) cpu is still in the middle of schedule() with
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_lock_switch().
@@ -2075,17 +2036,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
+#else /* CONFIG_SMP */
+
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
stat:
- if (schedstat_enabled())
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2095,12 +2068,13 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
+ * @rf: rq_flags of the caller's context, used for pinning rq->lock
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task.
*/
-static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
+static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
{
struct rq *rq = task_rq(p);
@@ -2117,11 +2091,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2129,12 +2103,16 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&rq->nr_iowait);
+ }
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
- ttwu_do_wakeup(rq, p, 0, cookie);
- if (schedstat_enabled())
- ttwu_stat(p, smp_processor_id(), 0);
+ ttwu_do_wakeup(rq, p, 0, rf);
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2417,7 +2395,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
- * We're setting the cpu for the first time, we don't migrate,
+ * We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
@@ -2560,7 +2538,7 @@ void wake_up_new_task(struct task_struct *p)
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
+ * - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
@@ -2568,6 +2546,7 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, 0);
@@ -2580,9 +2559,9 @@ void wake_up_new_task(struct task_struct *p)
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
@@ -2772,6 +2751,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* task and put them back on the free list.
*/
kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
+
put_task_struct(prev);
}
@@ -2847,7 +2830,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next, struct pin_cookie cookie)
+ struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
@@ -2873,13 +2856,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
+
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
/*
* Since the runqueue lock will be released by the next
* task (which is an invalid locking op but in the case
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2906,7 +2892,7 @@ unsigned long nr_running(void)
}
/*
- * Check if only the current task is running on the cpu.
+ * Check if only the current task is running on the CPU.
*
* Caution: this function does not check that the caller has disabled
* preemption, thus the result might have a time-of-check-to-time-of-use
@@ -2935,6 +2921,36 @@ unsigned long long nr_context_switches(void)
return sum;
}
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait accounting is to account the idle time that we could
+ * have spent running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've run at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under-accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
unsigned long nr_iowait(void)
{
unsigned long i, sum = 0;
@@ -2945,6 +2961,13 @@ unsigned long nr_iowait(void)
return sum;
}
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor are using nonsensical data. Boosting frequency for a CPU that has
+ * IO-wait which might not even end up running the task when it does become
+ * runnable.
+ */
+
unsigned long nr_iowait_cpu(int cpu)
{
struct rq *this = cpu_rq(cpu);
@@ -3028,8 +3051,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
@@ -3192,6 +3215,9 @@ static inline void preempt_latency_stop(int val) { }
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
if (oops_in_progress)
return;
@@ -3202,13 +3228,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
-#endif
if (panic_on_warn)
panic("scheduling while atomic\n");
@@ -3234,38 +3259,37 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
- schedstat_inc(this_rq(), sched_count);
+ schedstat_inc(this_rq()->sched_count);
}
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- const struct sched_class *class = &fair_sched_class;
+ const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, cookie);
+ if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
- /* assumes fair_sched_class->next == idle_sched_class */
+ /* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, cookie);
+ p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
again:
for_each_class(class) {
- p = class->pick_next_task(rq, prev, cookie);
+ p = class->pick_next_task(rq, prev, rf);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
@@ -3273,7 +3297,8 @@ again:
}
}
- BUG(); /* the idle class will always have a runnable task */
+ /* The idle class should always have a runnable task: */
+ BUG();
}
/*
@@ -3319,7 +3344,7 @@ static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
- struct pin_cookie cookie;
+ struct rq_flags rf;
struct rq *rq;
int cpu;
@@ -3327,17 +3352,6 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- /*
- * do_exit() calls schedule() with preemption disabled as an exception;
- * however we must fix that up, otherwise the next task will see an
- * inconsistent (higher) preempt count.
- *
- * It also avoids the below schedule_debug() test from complaining
- * about this.
- */
- if (unlikely(prev->state == TASK_DEAD))
- preempt_enable_no_resched_notrace();
-
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -3353,9 +3367,10 @@ static void __sched notrace __schedule(bool preempt)
*/
smp_mb__before_spinlock();
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
- rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+ /* Promote REQ to ACT */
+ rq->clock_update_flags <<= 1;
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
@@ -3365,6 +3380,11 @@ static void __sched notrace __schedule(bool preempt)
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
+ if (prev->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
/*
* If a worker went to sleep, notify and ask workqueue
* whether it wants to wake up a task to maintain
@@ -3375,7 +3395,7 @@ static void __sched notrace __schedule(bool preempt)
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
- try_to_wake_up_local(to_wakeup, cookie);
+ try_to_wake_up_local(to_wakeup, &rf);
}
}
switch_count = &prev->nvcsw;
@@ -3384,10 +3404,9 @@ static void __sched notrace __schedule(bool preempt)
if (task_on_rq_queued(prev))
update_rq_clock(rq);
- next = pick_next_task(rq, prev, cookie);
+ next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -3395,15 +3414,48 @@ static void __sched notrace __schedule(bool preempt)
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
+
+ /* Also unlocks the rq: */
+ rq = context_switch(rq, prev, next, &rf);
} else {
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irq(&rq->lock);
}
balance_callback(rq);
}
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
+
+void __noreturn do_task_dead(void)
+{
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is a race condition on mmap_sem (it is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNNING.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
+ *
+ * To avoid it, we have to wait for releasing current->pi_lock which
+ * is held by try_to_wake_up().
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&current->pi_lock);
+
+ /* Causes final put_task_struct in finish_task_switch(): */
+ __set_current_state(TASK_DEAD);
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+
+ __schedule(false);
+ BUG();
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
+ for (;;)
+ cpu_relax();
+}
static inline void sched_submit_work(struct task_struct *tsk)
{
@@ -3620,6 +3672,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3687,14 +3740,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, queue_flag);
+ if (running)
+ set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
- preempt_disable(); /* avoid rq from going away on us */
+ /* Avoid rq from going away on us: */
+ preempt_disable();
__task_rq_unlock(rq, &rf);
balance_callback(rq);
@@ -3704,7 +3758,8 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, queued;
+ bool queued, running;
+ int old_prio, delta;
struct rq_flags rf;
struct rq *rq;
@@ -3715,6 +3770,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3726,8 +3783,11 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
+ if (running)
+ put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3744,6 +3804,8 @@ void set_user_nice(struct task_struct *p, long nice)
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_curr(rq);
}
+ if (running)
+ set_curr_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -3756,7 +3818,7 @@ EXPORT_SYMBOL(set_user_nice);
*/
int can_nice(const struct task_struct *p, const int nice)
{
- /* convert nice value [19,-20] to rlimit style value [1,40] */
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
@@ -3812,7 +3874,7 @@ int task_prio(const struct task_struct *p)
}
/**
- * idle_cpu - is a given cpu idle currently?
+ * idle_cpu - is a given CPU idle currently?
* @cpu: the processor in question.
*
* Return: 1 if the CPU is currently idle. 0 otherwise.
@@ -3836,10 +3898,10 @@ int idle_cpu(int cpu)
}
/**
- * idle_task - return the idle task for a given cpu.
+ * idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
*
- * Return: The idle task for the cpu @cpu.
+ * Return: The idle task for the CPU @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -4005,7 +4067,7 @@ __checkparam_dl(const struct sched_attr *attr)
}
/*
- * check the target process has a UID that matches the current process's
+ * Check the target process has a UID that matches the current process's:
*/
static bool check_same_owner(struct task_struct *p)
{
@@ -4020,8 +4082,7 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
-static bool dl_param_changed(struct task_struct *p,
- const struct sched_attr *attr)
+static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -4048,10 +4109,10 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
- /* may grab non-irq protected spin_locks */
+ /* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
recheck:
- /* double check policy once rq lock held */
+ /* Double check policy once rq lock held: */
if (policy < 0) {
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
@@ -4091,11 +4152,11 @@ recheck:
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
- /* can't set/change the rt policy */
+ /* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
return -EPERM;
- /* can't increase priority */
+ /* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
return -EPERM;
@@ -4119,11 +4180,11 @@ recheck:
return -EPERM;
}
- /* can't change other user's priorities */
+ /* Can't change other user's priorities: */
if (!check_same_owner(p))
return -EPERM;
- /* Normal users shall not reset the sched_reset_on_fork flag */
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
}
@@ -4135,16 +4196,17 @@ recheck:
}
/*
- * make sure no PI-waiters arrive (or leave) while we are
+ * Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
*
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
- * Changing the policy of the stop threads its a very bad idea
+ * Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
task_rq_unlock(rq, p, &rf);
@@ -4200,7 +4262,7 @@ change:
#endif
}
- /* recheck policy now with rq lock held */
+ /* Re-check policy now with rq lock held: */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
@@ -4243,8 +4305,6 @@ change:
prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi);
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued) {
/*
* We enqueue to tail when the priority of a task is
@@ -4255,17 +4315,19 @@ change:
enqueue_task(rq, p, queue_flags);
}
+ if (running)
+ set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
- preempt_disable(); /* avoid rq from going away on us */
+
+ /* Avoid rq from going away on us: */
+ preempt_disable();
task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
- /*
- * Run balance callbacks after we've adjusted the PI chain.
- */
+ /* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
@@ -4358,8 +4420,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
/*
* Mimics kernel/events/core.c perf_copy_attr().
*/
-static int sched_copy_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr)
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
u32 size;
int ret;
@@ -4367,19 +4428,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
return -EFAULT;
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice: */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
+ /* Bail out on silly large: */
+ if (size > PAGE_SIZE)
goto err_size;
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = SCHED_ATTR_SIZE_VER0;
if (size < SCHED_ATTR_SIZE_VER0)
@@ -4414,7 +4475,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
return -EFAULT;
/*
- * XXX: do we want to be lenient like existing syscalls; or do we want
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
@@ -4434,10 +4495,8 @@ err_size:
*
* Return: 0 on success. An error code otherwise.
*/
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
- struct sched_param __user *, param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
- /* negative values for policy are not valid */
if (policy < 0)
return -EINVAL;
@@ -4747,10 +4806,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
}
/**
- * sys_sched_setaffinity - set the cpu affinity of a process
+ * sys_sched_setaffinity - set the CPU affinity of a process
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
+ * @user_mask_ptr: user-space pointer to the new CPU mask
*
* Return: 0 on success. An error code otherwise.
*/
@@ -4798,10 +4857,10 @@ out_unlock:
}
/**
- * sys_sched_getaffinity - get the cpu affinity of a process
+ * sys_sched_getaffinity - get the CPU affinity of a process
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
*
* Return: size of CPU mask copied to user_mask_ptr on success. An
* error code otherwise.
@@ -4846,7 +4905,7 @@ SYSCALL_DEFINE0(sched_yield)
{
struct rq *rq = this_rq_lock();
- schedstat_inc(rq, yld_count);
+ schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
/*
@@ -4863,6 +4922,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+#ifndef CONFIG_PREEMPT
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -4872,6 +4932,7 @@ int __sched _cond_resched(void)
return 0;
}
EXPORT_SYMBOL(_cond_resched);
+#endif
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4927,7 +4988,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
* Typical broken usage is:
*
* while (!event)
- * yield();
+ * yield();
*
* where one assumes that yield() will let 'the other' process run that will
* make event true. If the current task is a SCHED_FIFO task that will never
@@ -4997,7 +5058,7 @@ again:
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
- schedstat_inc(rq, yld_count);
+ schedstat_inc(rq->yld_count);
/*
* Make p's CPU reschedule; pick_next_entity takes care of
* fairness.
@@ -5018,31 +5079,48 @@ out_irq:
}
EXPORT_SYMBOL_GPL(yield_to);
+int io_schedule_prepare(void)
+{
+ int old_iowait = current->in_iowait;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+ current->in_iowait = token;
+}
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
long __sched io_schedule_timeout(long timeout)
{
- int old_iowait = current->in_iowait;
- struct rq *rq;
+ int token;
long ret;
- current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
- delayacct_blkio_start();
- rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
+ token = io_schedule_prepare();
ret = schedule_timeout(timeout);
- current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
+ io_schedule_finish(token);
return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);
+void io_schedule(void)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ schedule();
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
/**
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
@@ -5154,21 +5232,14 @@ void sched_show_task(struct task_struct *p)
int ppid;
unsigned long state = p->state;
+ if (!try_get_task_stack(p))
+ return;
if (state)
state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
- else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
if (state == TASK_RUNNING)
printk(KERN_CONT " running task ");
- else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -5183,6 +5254,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
+ put_task_stack(p);
}
void show_state_filter(unsigned long state_filter)
@@ -5231,7 +5303,7 @@ void init_idle_bootup_task(struct task_struct *idle)
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
- * @cpu: cpu the idle task belongs to
+ * @cpu: CPU the idle task belongs to
*
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
@@ -5247,6 +5319,7 @@ void init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ idle->flags |= PF_IDLE;
kasan_unpoison_task_stack(idle);
@@ -5261,7 +5334,7 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
/*
* We're having a chicken and egg problem, even though we are
- * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * holding rq->lock, the CPU isn't yet set to this CPU so the
* lockdep check in task_group() will fail.
*
* Similar case to sched_fork(). / Alternatively we could
@@ -5326,7 +5399,7 @@ int task_can_attach(struct task_struct *p,
/*
* Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
+ * to a new cpuset; we don't want to change their CPU
* affinity and isolating such threads by their set of
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
@@ -5375,7 +5448,7 @@ out:
#ifdef CONFIG_SMP
-static bool sched_smp_initialized __read_mostly;
+bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -5417,17 +5490,17 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
- if (running)
- p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
+ if (running)
+ set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
+ * Ensure that the idle task is using init_mm right before its CPU goes
* offline.
*/
void idle_task_exit(void)
@@ -5487,7 +5560,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct pin_cookie cookie;
+ struct rq_flags rf, old_rf;
int dest_cpu;
/*
@@ -5511,16 +5584,16 @@ static void migrate_tasks(struct rq *dead_rq)
for (;;) {
/*
* There's this thread running, bail when that's the only
- * remaining thread.
+ * remaining thread:
*/
if (rq->nr_running == 1)
break;
/*
- * pick_next_task assumes pinned rq->lock.
+ * pick_next_task() assumes pinned rq->lock:
*/
- cookie = lockdep_pin_lock(&rq->lock);
- next = pick_next_task(rq, &fake_task, cookie);
+ rq_pin_lock(rq, &rf);
+ next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5533,7 +5606,7 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
@@ -5548,6 +5621,13 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
+ /*
+ * __migrate_task() may return with a different
+ * rq->lock held and a new cookie in 'rf', but we need
+ * to preserve rf::clock_update_flags for 'dead_rq'.
+ */
+ old_rf = rf;
+
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5556,6 +5636,7 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
+ rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -5564,7 +5645,7 @@ static void migrate_tasks(struct rq *dead_rq)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static void set_rq_online(struct rq *rq)
+void set_rq_online(struct rq *rq)
{
if (!rq->online) {
const struct sched_class *class;
@@ -5579,7 +5660,7 @@ static void set_rq_online(struct rq *rq)
}
}
-static void set_rq_offline(struct rq *rq)
+void set_rq_offline(struct rq *rq)
{
if (rq->online) {
const struct sched_class *class;
@@ -5601,1560 +5682,10 @@ static void set_cpu_rq_start_time(unsigned int cpu)
rq->age_stamp = sched_clock_cpu(cpu);
}
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_debug_enabled;
-
-static int __init sched_debug_setup(char *str)
-{
- sched_debug_enabled = 1;
-
- return 0;
-}
-early_param("sched_debug", sched_debug_setup);
-
-static inline bool sched_debug(void)
-{
- return sched_debug_enabled;
-}
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- struct cpumask *groupmask)
-{
- struct sched_group *group = sd->groups;
-
- cpumask_clear(groupmask);
-
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
- return -1;
- }
-
- printk(KERN_CONT "span %*pbl level %s\n",
- cpumask_pr_args(sched_domain_span(sd)), sd->name);
-
- if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
- }
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
- printk(KERN_ERR "ERROR: domain->groups does not contain"
- " CPU%d\n", cpu);
- }
-
- printk(KERN_DEBUG "%*s groups:", level + 1, "");
- do {
- if (!group) {
- printk("\n");
- printk(KERN_ERR "ERROR: group is NULL\n");
- break;
- }
-
- if (!cpumask_weight(sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: empty group\n");
- break;
- }
-
- if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: repeated CPUs\n");
- break;
- }
-
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
-
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
- group->sgc->capacity);
- }
-
- group = group->next;
- } while (group != sd->groups);
- printk(KERN_CONT "\n");
-
- if (!cpumask_equal(sched_domain_span(sd), groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
- if (sd->parent &&
- !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
- return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
- int level = 0;
-
- if (!sched_debug_enabled)
- return;
-
- if (!sd) {
- printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
- return;
- }
-
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
- for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
- break;
- level++;
- sd = sd->parent;
- if (!sd)
- break;
- }
-}
-#else /* !CONFIG_SCHED_DEBUG */
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
-{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
-
- /* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
-
- /* Following flags don't use groups */
- if (sd->flags & (SD_WAKE_AFFINE))
- return 0;
-
- return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
- unsigned long cflags = sd->flags, pflags = parent->flags;
-
- if (sd_degenerate(parent))
- return 1;
-
- if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
- return 0;
-
- /* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
- if (~cflags & pflags)
- return 0;
-
- return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
- struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
- cpupri_cleanup(&rd->cpupri);
- cpudl_cleanup(&rd->cpudl);
- free_cpumask_var(rd->dlo_mask);
- free_cpumask_var(rd->rto_mask);
- free_cpumask_var(rd->online);
- free_cpumask_var(rd->span);
- kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
- struct root_domain *old_rd = NULL;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- if (rq->rd) {
- old_rd = rq->rd;
-
- if (cpumask_test_cpu(rq->cpu, old_rd->online))
- set_rq_offline(rq);
-
- cpumask_clear_cpu(rq->cpu, old_rd->span);
-
- /*
- * If we dont want to free the old_rd yet then
- * set old_rd to NULL to skip the freeing later
- * in this function:
- */
- if (!atomic_dec_and_test(&old_rd->refcount))
- old_rd = NULL;
- }
-
- atomic_inc(&rd->refcount);
- rq->rd = rd;
-
- cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
- set_rq_online(rq);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-
- if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
- memset(rd, 0, sizeof(*rd));
-
- if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
- goto out;
- if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
- goto free_span;
- if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
- goto free_online;
- if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_dlo_mask;
-
- init_dl_bw(&rd->dl_bw);
- if (cpudl_init(&rd->cpudl) != 0)
- goto free_dlo_mask;
-
- if (cpupri_init(&rd->cpupri) != 0)
- goto free_rto_mask;
- return 0;
-
-free_rto_mask:
- free_cpumask_var(rd->rto_mask);
-free_dlo_mask:
- free_cpumask_var(rd->dlo_mask);
-free_online:
- free_cpumask_var(rd->online);
-free_span:
- free_cpumask_var(rd->span);
-out:
- return -ENOMEM;
-}
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-struct root_domain def_root_domain;
-
-static void init_defrootdomain(void)
-{
- init_rootdomain(&def_root_domain);
-
- atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
- struct root_domain *rd;
-
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
- if (!rd)
- return NULL;
-
- if (init_rootdomain(rd) != 0) {
- kfree(rd);
- return NULL;
- }
-
- return rd;
-}
-
-static void free_sched_groups(struct sched_group *sg, int free_sgc)
-{
- struct sched_group *tmp, *first;
-
- if (!sg)
- return;
-
- first = sg;
- do {
- tmp = sg->next;
-
- if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
- kfree(sg->sgc);
-
- kfree(sg);
- sg = tmp;
- } while (sg != first);
-}
-
-static void free_sched_domain(struct rcu_head *rcu)
-{
- struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
- /*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
- */
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
- kfree(sd);
-}
-
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
-{
- call_rcu(&sd->rcu, free_sched_domain);
-}
-
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
-{
- for (; sd; sd = sd->parent)
- destroy_sched_domain(sd, cpu);
-}
-
-/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first cpu number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see cpus_share_cache().
- */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
-
-static void update_top_cache_domain(int cpu)
-{
- struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
- int id = cpu;
- int size = 1;
-
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
- busy_sd = sd->parent; /* sd_busy */
- }
- rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
-
- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
-
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
-
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
-}
-
-/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
- */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
-
- /* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; ) {
- struct sched_domain *parent = tmp->parent;
- if (!parent)
- break;
-
- if (sd_parent_degenerate(tmp, parent)) {
- tmp->parent = parent->parent;
- if (parent->parent)
- parent->parent->child = tmp;
- /*
- * Transfer SD_PREFER_SIBLING down in case of a
- * degenerate parent; the spans match for this
- * so the property transfers.
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- destroy_sched_domain(parent, cpu);
- } else
- tmp = tmp->parent;
- }
-
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- destroy_sched_domain(tmp, cpu);
- if (sd)
- sd->child = NULL;
- }
-
- sched_domain_debug(sd, cpu);
-
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
-
- update_top_cache_domain(cpu);
-}
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- int ret;
-
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- ret = cpulist_parse(str, cpu_isolated_map);
- if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
- return 0;
- }
- return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
-struct s_data {
- struct sched_domain ** __percpu sd;
- struct root_domain *rd;
-};
-
-enum s_alloc {
- sa_rootdomain,
- sa_sd,
- sa_sd_storage,
- sa_none,
-};
-
-/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
- *
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
- *
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * cpu they're built on, so check that.
- *
- */
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
-{
- const struct cpumask *span = sched_domain_span(sd);
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- for_each_cpu(i, span) {
- sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- cpumask_set_cpu(i, sched_group_mask(sg));
- }
-}
-
-/*
- * Return the canonical balance cpu for this group, this is the first cpu
- * of this group that's also in the iteration mask.
- */
-int group_balance_cpu(struct sched_group *sg)
-{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
-}
-
-static int
-build_overlap_sched_groups(struct sched_domain *sd, int cpu)
-{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered = sched_domains_tmpmask;
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct cpumask *sg_span;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- sibling = *per_cpu_ptr(sdd->sd, i);
-
- /* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
- if (!sg)
- goto fail;
-
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
- cpumask_or(covered, covered, sg_span);
-
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance cpu. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- last->next = first;
- }
- sd->groups = groups;
-
- return 0;
-
-fail:
- free_sched_groups(first, 0);
-
- return -ENOMEM;
-}
-
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
-{
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- struct sched_domain *child = sd->child;
-
- if (child)
- cpu = cpumask_first(sched_domain_span(child));
-
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
- atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
- }
-
- return cpu;
-}
-
-/*
- * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
- *
- * Assumes the sched_domain tree is fully constructed
- */
-static int
-build_sched_groups(struct sched_domain *sd, int cpu)
-{
- struct sched_group *first = NULL, *last = NULL;
- struct sd_data *sdd = sd->private;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered;
- int i;
-
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
- lockdep_assert_held(&sched_domains_mutex);
- covered = sched_domains_tmpmask;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group, j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
-
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-
- return 0;
-}
-
-/*
- * Initialize sched groups cpu_capacity.
- *
- * cpu_capacity indicates the capacity of sched group, which is used while
- * distributing the load between different sched groups in a sched domain.
- * Typically cpu_capacity for all the groups in a sched domain will be same
- * unless there are asymmetries in the topology. If there are asymmetries,
- * group having more cpu_capacity will pickup more load compared to the
- * group having less cpu_capacity.
- */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
-{
- struct sched_group *sg = sd->groups;
-
- WARN_ON(!sg);
-
- do {
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
- sg = sg->next;
- } while (sg != sd->groups);
-
- if (cpu != group_balance_cpu(sg))
- return;
-
- update_group_capacity(sd, cpu);
- atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
-}
-
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
- if (kstrtoint(str, 0, &default_relax_domain_level))
- pr_warn("Unable to set relax_domain_level\n");
-
- return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
- struct sched_domain_attr *attr)
-{
- int request;
-
- if (!attr || attr->relax_domain_level < 0) {
- if (default_relax_domain_level < 0)
- return;
- else
- request = default_relax_domain_level;
- } else
- request = attr->relax_domain_level;
- if (request < sd->level) {
- /* turn off idle balance on this domain */
- sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* turn on idle balance on this domain */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- }
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
- const struct cpumask *cpu_map)
-{
- switch (what) {
- case sa_rootdomain:
- if (!atomic_read(&d->rd->refcount))
- free_rootdomain(&d->rd->rcu); /* fall through */
- case sa_sd:
- free_percpu(d->sd); /* fall through */
- case sa_sd_storage:
- __sdt_free(cpu_map); /* fall through */
- case sa_none:
- break;
- }
-}
-
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
- const struct cpumask *cpu_map)
-{
- memset(d, 0, sizeof(*d));
-
- if (__sdt_alloc(cpu_map))
- return sa_sd_storage;
- d->sd = alloc_percpu(struct sched_domain *);
- if (!d->sd)
- return sa_sd_storage;
- d->rd = alloc_rootdomain();
- if (!d->rd)
- return sa_sd;
- return sa_rootdomain;
-}
-
-/*
- * NULL the sd_data elements we've used to build the sched_domain and
- * sched_group structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
-{
- struct sd_data *sdd = sd->private;
-
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
- *per_cpu_ptr(sdd->sgc, cpu) = NULL;
-}
-
-#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
-enum numa_topology_type sched_numa_topology_type;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
-#endif
-
-/*
- * SD_flags allowed in topology descriptions.
- *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- *
- * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
- */
-#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUCAPACITY | \
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
-
-static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
-{
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int sd_weight, sd_flags = 0;
-
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
-
- if (tl->sd_flags)
- sd_flags = (*tl->sd_flags)();
- if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- *sd = (struct sched_domain){
- .min_interval = sd_weight,
- .max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
-
- .cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
-
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
- | 1*SD_BALANCE_EXEC
- | 1*SD_BALANCE_FORK
- | 0*SD_BALANCE_WAKE
- | 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
- | 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
- | 0*SD_NUMA
- | sd_flags
- ,
-
- .last_balance = jiffies,
- .balance_interval = sd_weight,
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
-#ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
-#endif
- };
-
- /*
- * Convert topological properties into behaviour.
- */
-
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
-
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->imbalance_pct = 117;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
-
-#ifdef CONFIG_NUMA
- } else if (sd->flags & SD_NUMA) {
- sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
-
- sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
- sd->flags &= ~(SD_BALANCE_EXEC |
- SD_BALANCE_FORK |
- SD_WAKE_AFFINE);
- }
-
-#endif
- } else {
- sd->flags |= SD_PREFER_SIBLING;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
- }
-
- sd->private = &tl->data;
-
- return sd;
-}
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology =
- default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
-
-void set_sched_topology(struct sched_domain_topology_level *tl)
-{
- sched_domain_topology = tl;
-}
-
-#ifdef CONFIG_NUMA
-
-static const struct cpumask *sd_numa_mask(int cpu)
-{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
-}
-
-static void sched_numa_warn(const char *str)
-{
- static int done = false;
- int i,j;
-
- if (done)
- return;
-
- done = true;
-
- printk(KERN_WARNING "ERROR: %s\n\n", str);
-
- for (i = 0; i < nr_node_ids; i++) {
- printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
- printk(KERN_CONT "\n");
- }
- printk(KERN_WARNING "\n");
-}
-
-bool find_numa_distance(int distance)
-{
- int i;
-
- if (distance == node_distance(0, 0))
- return true;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
- }
-
- return false;
-}
-
-/*
- * A system can have three types of NUMA topology:
- * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
- * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
- * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
- *
- * The difference between a glueless mesh topology and a backplane
- * topology lies in whether communication between not directly
- * connected nodes goes through intermediary nodes (where programs
- * could run), or through backplane controllers. This affects
- * placement of programs.
- *
- * The type of topology can be discerned with the following tests:
- * - If the maximum distance between any nodes is 1 hop, the system
- * is directly connected.
- * - If for two nodes A and B, located N > 1 hops away from each other,
- * there is an intermediary node C, which is < N hops away from both
- * nodes A and B, the system is a glueless mesh.
- */
-static void init_numa_topology_type(void)
-{
- int a, b, c, n;
-
- n = sched_max_numa_distance;
-
- if (sched_domains_numa_levels <= 1) {
- sched_numa_topology_type = NUMA_DIRECT;
- return;
- }
-
- for_each_online_node(a) {
- for_each_online_node(b) {
- /* Find two nodes furthest removed from each other. */
- if (node_distance(a, b) < n)
- continue;
-
- /* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
- if (node_distance(a, c) < n &&
- node_distance(b, c) < n) {
- sched_numa_topology_type =
- NUMA_GLUELESS_MESH;
- return;
- }
- }
-
- sched_numa_topology_type = NUMA_BACKPLANE;
- return;
- }
- }
-}
-
-static void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
- * unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
- */
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
-
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
- }
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
- }
-
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
- }
-
- if (!level)
- return;
-
- /*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
- *
- * The sched_domains_numa_distance[] array includes the actual distance
- * numbers.
- */
-
- /*
- * Here, we should temporarily reset sched_domains_numa_levels to 0.
- * If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
- * dangerous when we use it to iterate array sched_domains_numa_masks[][]
- * in other functions.
- *
- * We reset it to 'level' at the end of this function.
- */
- sched_domains_numa_levels = 0;
-
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
- return;
-
- /*
- * Now for each level, construct a mask per node which contains all
- * cpus of nodes that are that many hops away from us.
- */
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
- return;
-
- for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!mask)
- return;
-
- sched_domains_numa_masks[i][j] = mask;
-
- for_each_node(k) {
- if (node_distance(j, k) > sched_domains_numa_distance[i])
- continue;
-
- cpumask_or(mask, mask, cpumask_of_node(k));
- }
- }
- }
-
- /* Compute default topology size */
- for (i = 0; sched_domain_topology[i].mask; i++);
-
- tl = kzalloc((i + level + 1) *
- sizeof(struct sched_domain_topology_level), GFP_KERNEL);
- if (!tl)
- return;
-
- /*
- * Copy the default topology bits..
- */
- for (i = 0; sched_domain_topology[i].mask; i++)
- tl[i] = sched_domain_topology[i];
-
- /*
- * .. and append 'j' levels of NUMA goodness.
- */
- for (j = 0; j < level; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
- }
-
- sched_domain_topology = tl;
-
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
-
- init_numa_topology_type();
-}
-
-static void sched_domains_numa_masks_set(unsigned int cpu)
-{
- int node = cpu_to_node(cpu);
- int i, j;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
- cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
- }
-}
-
-static void sched_domains_numa_masks_clear(unsigned int cpu)
-{
- int i, j;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
-}
-
-#else
-static inline void sched_init_numa(void) { }
-static void sched_domains_numa_masks_set(unsigned int cpu) { }
-static void sched_domains_numa_masks_clear(unsigned int cpu) { }
-#endif /* CONFIG_NUMA */
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- sdd->sd = alloc_percpu(struct sched_domain *);
- if (!sdd->sd)
- return -ENOMEM;
-
- sdd->sg = alloc_percpu(struct sched_group *);
- if (!sdd->sg)
- return -ENOMEM;
-
- sdd->sgc = alloc_percpu(struct sched_group_capacity *);
- if (!sdd->sgc)
- return -ENOMEM;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
- struct sched_group *sg;
- struct sched_group_capacity *sgc;
-
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sd)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sd, j) = sd;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sg)
- return -ENOMEM;
-
- sg->next = sg;
-
- *per_cpu_ptr(sdd->sg, j) = sg;
-
- sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sgc)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sgc, j) = sgc;
- }
- }
-
- return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
-
- if (sdd->sd) {
- sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
-
- if (sdd->sg)
- kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgc)
- kfree(*per_cpu_ptr(sdd->sgc, j));
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- free_percpu(sdd->sg);
- sdd->sg = NULL;
- free_percpu(sdd->sgc);
- sdd->sgc = NULL;
- }
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
-{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
-
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
- sd->child = child;
-
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
- pr_err(" the %s domain not a subset of the %s domain\n",
- child->name, sd->name);
-#endif
- /* Fixup, ensure @sd has at least @child cpus. */
- cpumask_or(sched_domain_span(sd),
- sched_domain_span(sd),
- sched_domain_span(child));
- }
-
- }
- set_domain_attribute(sd, attr);
-
- return sd;
-}
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
-{
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- int i, ret = -ENOMEM;
-
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- if (alloc_state != sa_rootdomain)
- goto error;
-
- /* Set up domains for cpus specified by the cpu_map. */
- for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
-
- sd = NULL;
- for_each_sd_topology(tl) {
- sd = build_sched_domain(tl, cpu_map, attr, sd, i);
- if (tl == sched_domain_topology)
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
- sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
- }
- }
-
- /* Build the groups for the domains */
- for_each_cpu(i, cpu_map) {
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
- if (build_overlap_sched_groups(sd, i))
- goto error;
- } else {
- if (build_sched_groups(sd, i))
- goto error;
- }
- }
- }
-
- /* Calculate CPU capacity for physical packages and nodes */
- for (i = nr_cpumask_bits-1; i >= 0; i--) {
- if (!cpumask_test_cpu(i, cpu_map))
- continue;
-
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
- init_sched_groups_capacity(i, sd);
- }
- }
-
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
-
- ret = 0;
-error:
- __free_domain_allocs(&d, alloc_state, cpu_map);
- return ret;
-}
-
-static cpumask_var_t *doms_cur; /* current sched domains */
-static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
- /* attribues of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __weak arch_update_cpu_topology(void)
-{
- return 0;
-}
-
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
- int i;
- cpumask_var_t *doms;
-
- doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
- if (!doms)
- return NULL;
- for (i = 0; i < ndoms; i++) {
- if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
- free_sched_domains(doms, i);
- return NULL;
- }
- }
- return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
- unsigned int i;
- for (i = 0; i < ndoms; i++)
- free_cpumask_var(doms[i]);
- kfree(doms);
-}
-
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
+ * used to mark begin/end of suspend/resume:
*/
-static int init_sched_domains(const struct cpumask *cpu_map)
-{
- int err;
-
- arch_update_cpu_topology();
- ndoms_cur = 1;
- doms_cur = alloc_sched_domains(ndoms_cur);
- if (!doms_cur)
- doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
-
- return err;
-}
-
-/*
- * Detach sched domains from a group of cpus specified in cpu_map
- * These cpus will now be attached to the NULL domain
- */
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
- int i;
-
- rcu_read_lock();
- for_each_cpu(i, cpu_map)
- cpu_attach_domain(NULL, &def_root_domain, i);
- rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
- struct sched_domain_attr *new, int idx_new)
-{
- struct sched_domain_attr tmp;
-
- /* fast path */
- if (!new && !cur)
- return 1;
-
- tmp = SD_ATTR_INIT;
- return !memcmp(cur ? (cur + idx_cur) : &tmp,
- new ? (new + idx_new) : &tmp,
- sizeof(struct sched_domain_attr));
-}
-
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.) We should setup one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed in 'doms_new' should be allocated using
- * alloc_sched_domains. This routine takes ownership of it and will
- * free_sched_domains it when done with it. If the caller failed the
- * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- int i, j, n;
- int new_topology;
-
- mutex_lock(&sched_domains_mutex);
-
- /* always unregister in case we don't destroy any domains */
- unregister_sched_domain_sysctl();
-
- /* Let architecture update cpu core mappings. */
- new_topology = arch_update_cpu_topology();
-
- n = doms_new ? ndoms_new : 0;
-
- /* Destroy deleted domains */
- for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_cur[i], doms_new[j])
- && dattrs_equal(dattr_cur, i, dattr_new, j))
- goto match1;
- }
- /* no match - a current sched domain not in new doms_new[] */
- detach_destroy_domains(doms_cur[i]);
-match1:
- ;
- }
-
- n = ndoms_cur;
- if (doms_new == NULL) {
- n = 0;
- doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
- }
-
- /* Build new domains */
- for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_new[i], doms_cur[j])
- && dattrs_equal(dattr_new, i, dattr_cur, j))
- goto match2;
- }
- /* no match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
- ;
- }
-
- /* Remember the new sched domains */
- if (doms_cur != &fallback_doms)
- free_sched_domains(doms_cur, ndoms_cur);
- kfree(dattr_cur); /* kfree(NULL) is safe */
- doms_cur = doms_new;
- dattr_cur = dattr_new;
- ndoms_cur = ndoms_new;
-
- register_sched_domain_sysctl();
-
- mutex_unlock(&sched_domains_mutex);
-}
-
-static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+static int num_cpus_frozen;
/*
* Update cpusets according to cpu_active mask. If cpusets are
@@ -7231,7 +5762,7 @@ int sched_cpu_activate(unsigned int cpu)
* Put the rq online, if not already. This happens:
*
* 1) In the early boot process, because we build the real domains
- * after all cpus have been brought up.
+ * after all CPUs have been brought up.
*
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
@@ -7319,6 +5850,22 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+
+static void sched_init_smt(void)
+{
+ /*
+ * We've enumerated all CPUs and will assume that if any CPU
+ * has SMT siblings, CPU0 will too.
+ */
+ if (cpumask_weight(cpu_smt_mask(0)) > 1)
+ static_branch_enable(&sched_smt_present);
+}
+#else
+static inline void sched_init_smt(void) { }
+#endif
+
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7330,7 +5877,7 @@ void __init sched_init_smp(void)
/*
* There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
+ * CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
mutex_lock(&sched_domains_mutex);
@@ -7348,6 +5895,10 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+
+ sched_init_smt();
+ sched_clock_init_late();
+
sched_smp_initialized = true;
}
@@ -7362,6 +5913,7 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
+ sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -7385,12 +5937,31 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
+ sched_clock_init();
+
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
@@ -7421,13 +5992,13 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth,
- global_rt_period(), global_rt_runtime());
+ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -7461,19 +6032,20 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
- * How much cpu bandwidth does root_task_group get?
+ * How much CPU bandwidth does root_task_group get?
*
* In case of task-groups formed thr' the cgroup filesystem, it
- * gets 100% of the cpu resources in the system. This overall
- * system cpu resource is divided among the tasks of
+ * gets 100% of the CPU resources in the system. This overall
+ * system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
* In other words, if root_task_group has 10 tasks of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
- * then A0's share of the cpu resource is:
+ * then A0's share of the CPU resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
@@ -7523,10 +6095,6 @@ void __init sched_init(void)
set_load_weight(&init_task);
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -7534,11 +6102,6 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
- /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7591,9 +6154,14 @@ EXPORT_SYMBOL(__might_sleep);
void ___might_sleep(const char *file, int line, int preempt_offset)
{
- static unsigned long prev_jiffy; /* ratelimiting */
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
+
+ unsigned long preempt_disable_ip;
+
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
@@ -7602,6 +6170,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
return;
prev_jiffy = jiffies;
+ /* Save this before calling printk(), since that will clobber it: */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
@@ -7616,14 +6187,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
+ print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
-#endif
dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
#endif
@@ -7644,12 +6215,10 @@ void normalize_rt_tasks(void)
if (p->flags & PF_KTHREAD)
continue;
- p->se.exec_start = 0;
-#ifdef CONFIG_SCHEDSTATS
- p->se.statistics.wait_start = 0;
- p->se.statistics.sleep_start = 0;
- p->se.statistics.block_start = 0;
-#endif
+ p->se.exec_start = 0;
+ schedstat_set(p->se.statistics.wait_start, 0);
+ schedstat_set(p->se.statistics.sleep_start, 0);
+ schedstat_set(p->se.statistics.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/*
@@ -7680,7 +6249,7 @@ void normalize_rt_tasks(void)
*/
/**
- * curr_task - return the current task for a given cpu.
+ * curr_task - return the current task for a given CPU.
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
@@ -7696,13 +6265,13 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given cpu.
+ * set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
* Description: This function must only be used when non-maskable interrupts
* are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner. This function
+ * notion of the current task on a CPU in a non-blocking manner. This function
* must be called with all CPU's synchronized, and interrupts disabled, the
* and caller must save the original value of the current task (see
* curr_task() above) and restore that value before reenabling interrupts and
@@ -7710,7 +6279,7 @@ struct task_struct *curr_task(int cpu)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-void set_curr_task(int cpu, struct task_struct *p)
+void ia64_set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
}
@@ -7758,7 +6327,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
- WARN_ON(!parent); /* root should already exist */
+ /* Root should already exist: */
+ WARN_ON(!parent);
tg->parent = parent;
INIT_LIST_HEAD(&tg->children);
@@ -7771,13 +6341,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
/* rcu callback to free various structures associated with a task group */
static void sched_free_group_rcu(struct rcu_head *rhp)
{
- /* now it should be safe to free those cfs_rqs */
+ /* Now it should be safe to free those cfs_rqs: */
sched_free_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
- /* wait for possible concurrent references to cfs_rqs complete */
+ /* Wait for possible concurrent references to cfs_rqs complete: */
call_rcu(&tg->rcu, sched_free_group_rcu);
}
@@ -7785,7 +6355,7 @@ void sched_offline_group(struct task_group *tg)
{
unsigned long flags;
- /* end participation in shares distribution */
+ /* End participation in shares distribution: */
unregister_fair_sched_group(tg);
spin_lock_irqsave(&task_group_lock, flags);
@@ -7830,21 +6400,22 @@ void sched_move_task(struct task_struct *tsk)
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ update_rq_clock(rq);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
+ if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
- if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ if (running)
+ set_curr_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
@@ -8213,11 +6784,14 @@ int sched_rr_handler(struct ctl_table *table, int write,
mutex_lock(&mutex);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /* make sure that internally we keep jiffies */
- /* also, writing zero resets timeslice to default */
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the timeslice to default:
+ */
if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
@@ -8278,6 +6852,7 @@ static void cpu_cgroup_fork(struct task_struct *task)
rq = task_rq_lock(task, &rf);
+ update_rq_clock(rq);
sched_change_group(task, TASK_SET_GROUP);
task_rq_unlock(rq, task, &rf);
@@ -8397,9 +6972,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_b->quota = quota;
__refill_cfs_bandwidth_runtime(cfs_b);
- /* restart the period timer (if active) to handle new period expiry */
+
+ /* Restart the period timer (if active) to handle new period expiry: */
if (runtime_enabled)
start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {
@@ -8537,8 +7114,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
parent_quota = parent_b->hierarchical_quota;
/*
- * ensure max(child_quota) <= parent_quota, inherit when no
- * limit is set
+ * Ensure max(child_quota) <= parent_quota, inherit when no
+ * limit is set:
*/
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -8647,7 +7224,7 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
- { } /* terminate */
+ { } /* Terminate */
};
struct cgroup_subsys cpu_cgrp_subsys = {
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bc0b309c3f19..f95ab29a45d0 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
- cputime64_to_clock_t(val[stat]));
+ (long long)nsec_to_clock_t(val[stat]));
}
return 0;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d4184498c9f5..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
return (i << 1) + 2;
}
-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
{
- int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+ int l, r, largest;
- swap(cp->elements[a].cpu, cp->elements[b].cpu);
- swap(cp->elements[a].dl , cp->elements[b].dl );
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
- swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
-}
-
-static void cpudl_heapify(struct cpudl *cp, int idx)
-{
- int l, r, largest;
+ if (left_child(idx) >= cp->size)
+ return;
/* adapted from lib/prio_heap.c */
while(1) {
+ u64 largest_dl;
l = left_child(idx);
r = right_child(idx);
largest = idx;
+ largest_dl = orig_dl;
- if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
- cp->elements[l].dl))
+ if ((l < cp->size) && dl_time_before(orig_dl,
+ cp->elements[l].dl)) {
largest = l;
- if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
- cp->elements[r].dl))
+ largest_dl = cp->elements[l].dl;
+ }
+ if ((r < cp->size) && dl_time_before(largest_dl,
+ cp->elements[r].dl))
largest = r;
+
if (largest == idx)
break;
- /* Push idx down the heap one level and bump one up */
- cpudl_exchange(cp, largest, idx);
+ /* pull largest child onto idx */
+ cp->elements[idx].cpu = cp->elements[largest].cpu;
+ cp->elements[idx].dl = cp->elements[largest].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
idx = largest;
}
+ /* actual push down of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
}
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
{
- WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+ int p;
- if (dl_time_before(new_dl, cp->elements[idx].dl)) {
- cp->elements[idx].dl = new_dl;
- cpudl_heapify(cp, idx);
- } else {
- cp->elements[idx].dl = new_dl;
- while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
- cp->elements[idx].dl)) {
- cpudl_exchange(cp, idx, parent(idx));
- idx = parent(idx);
- }
- }
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
+
+ if (idx == 0)
+ return;
+
+ do {
+ p = parent(idx);
+ if (dl_time_before(orig_dl, cp->elements[p].dl))
+ break;
+ /* pull parent onto idx */
+ cp->elements[idx].cpu = cp->elements[p].cpu;
+ cp->elements[idx].dl = cp->elements[p].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+ idx = p;
+ } while (idx != 0);
+ /* actual push up of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl))
+ cpudl_heapify_up(cp, idx);
+ else
+ cpudl_heapify_down(cp, idx);
}
static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
}
/*
- * cpudl_set - update the cpudl max-heap
+ * cpudl_clear - remove a cpu from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_clear(struct cpudl *cp, int cpu)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
+
old_idx = cp->elements[cpu].idx;
- if (!is_valid) {
- /* remove item */
- if (old_idx == IDX_INVALID) {
- /*
- * Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
- * called for a CPU without -dl tasks running.
- */
- goto out;
- }
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ } else {
new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
- while (old_idx > 0 && dl_time_before(
- cp->elements[parent(old_idx)].dl,
- cp->elements[old_idx].dl)) {
- cpudl_exchange(cp, old_idx, parent(old_idx));
- old_idx = parent(old_idx);
- }
- cpumask_set_cpu(cpu, cp->free_cpus);
- cpudl_heapify(cp, old_idx);
+ cpudl_heapify(cp, old_idx);
- goto out;
+ cpumask_set_cpu(cpu, cp->free_cpus);
}
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
+{
+ int old_idx;
+ unsigned long flags;
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+
+ old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) {
- cp->size++;
- cp->elements[cp->size - 1].dl = dl;
- cp->elements[cp->size - 1].cpu = cpu;
- cp->elements[cpu].idx = cp->size - 1;
- cpudl_change_key(cp, cp->size - 1, dl);
+ int new_idx = cp->size++;
+ cp->elements[new_idx].dl = dl;
+ cp->elements[new_idx].cpu = cpu;
+ cp->elements[cpu].idx = new_idx;
+ cpudl_heapify_up(cp, new_idx);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
- cpudl_change_key(cp, old_idx, dl);
+ cp->elements[old_idx].dl = dl;
+ cpudl_heapify(cp, old_idx);
}
-out:
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83fed7e..f7da8c55bba0 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954e73b4..dbc51442ecbc 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
*/
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
- unsigned long util, unsigned long max))
+ unsigned int flags))
{
if (WARN_ON(!data || !func))
return;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b222c1..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,12 +12,14 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
-#include <linux/module.h>
+#include <linux/kthread.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
+#define SUGOV_KTHREAD_PRIORITY 50
+
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
@@ -36,8 +38,10 @@ struct sugov_policy {
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
- struct work_struct work;
+ struct kthread_work work;
struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
@@ -48,11 +52,14 @@ struct sugov_cpu {
struct sugov_policy *sg_policy;
unsigned int cached_raw_freq;
+ unsigned long iowait_boost;
+ unsigned long iowait_boost_max;
+ u64 last_update;
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
- u64 last_update;
+ unsigned int flags;
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -144,24 +151,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
return cpufreq_driver_resolve_freq(policy, freq);
}
+static void sugov_get_util(unsigned long *util, unsigned long *max)
+{
+ struct rq *rq = this_rq();
+ unsigned long cfs_max;
+
+ cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+
+ *util = min(rq->cfs.avg.util_avg, cfs_max);
+ *max = cfs_max;
+}
+
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ if (flags & SCHED_CPUFREQ_IOWAIT) {
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC)
+ sg_cpu->iowait_boost = 0;
+ }
+}
+
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
+ unsigned long *max)
+{
+ unsigned long boost_util = sg_cpu->iowait_boost;
+ unsigned long boost_max = sg_cpu->iowait_boost_max;
+
+ if (!boost_util)
+ return;
+
+ if (*util * boost_max < *max * boost_util) {
+ *util = boost_util;
+ *max = boost_max;
+ }
+ sg_cpu->iowait_boost >>= 1;
+}
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
- unsigned long util, unsigned long max)
+ unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util, max;
unsigned int next_f;
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
if (!sugov_should_update_freq(sg_policy, time))
return;
- next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
- get_next_freq(sg_cpu, util, max);
+ if (flags & SCHED_CPUFREQ_RT_DL) {
+ next_f = policy->cpuinfo.max_freq;
+ } else {
+ sugov_get_util(&util, &max);
+ sugov_iowait_boost(sg_cpu, &util, &max);
+ next_f = get_next_freq(sg_cpu, util, max);
+ }
sugov_update_commit(sg_policy, time, next_f);
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max)
+ unsigned long util, unsigned long max,
+ unsigned int flags)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
@@ -169,9 +227,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
u64 last_freq_update_time = sg_policy->last_freq_update_time;
unsigned int j;
- if (util == ULONG_MAX)
+ if (flags & SCHED_CPUFREQ_RT_DL)
return max_f;
+ sugov_iowait_boost(sg_cpu, &util, &max);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu;
unsigned long j_util, j_max;
@@ -186,48 +246,57 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
* frequency update and the time elapsed between the last update
* of the CPU utilization and the last frequency update is long
* enough, don't take the CPU into account as it probably is
- * idle now.
+ * idle now (and clear iowait_boost for it).
*/
delta_ns = last_freq_update_time - j_sg_cpu->last_update;
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
+ j_sg_cpu->iowait_boost = 0;
continue;
-
- j_util = j_sg_cpu->util;
- if (j_util == ULONG_MAX)
+ }
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
return max_f;
+ j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
+
+ sugov_iowait_boost(j_sg_cpu, &util, &max);
}
return get_next_freq(sg_cpu, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
- unsigned long util, unsigned long max)
+ unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned long util, max;
unsigned int next_f;
+ sugov_get_util(&util, &max);
+
raw_spin_lock(&sg_policy->update_lock);
sg_cpu->util = util;
sg_cpu->max = max;
+ sg_cpu->flags = flags;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
sugov_update_commit(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
@@ -244,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
@@ -307,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- INIT_WORK(&sg_policy->work, sugov_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
@@ -352,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy)
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
- goto free_sg_policy;
+ goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
@@ -373,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy)
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
- goto free_sg_policy;
+ goto stop_kthread;
}
tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -390,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
- free_sg_policy:
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -414,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -425,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
+ sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -444,10 +585,13 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
if (policy_is_shared(policy)) {
- sg_cpu->util = ULONG_MAX;
+ sg_cpu->util = 0;
sg_cpu->max = 0;
+ sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->last_update = 0;
sg_cpu->cached_raw_freq = 0;
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
@@ -468,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- cancel_work_sync(&sg_policy->work);
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
}
static void sugov_limits(struct cpufreq_policy *policy)
@@ -495,28 +641,15 @@ static struct cpufreq_governor schedutil_gov = {
.limits = sugov_limits,
};
-static int __init sugov_module_init(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-
-static void __exit sugov_module_exit(void)
-{
- cpufreq_unregister_governor(&schedutil_gov);
-}
-
-MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
-MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
-MODULE_LICENSE("GPL");
-
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &schedutil_gov;
}
-
-fs_initcall(sugov_module_init);
-#else
-module_init(sugov_module_init);
#endif
-module_exit(sugov_module_exit);
+
+static int __init sugov_register(void)
+{
+ return cpufreq_register_governor(&schedutil_gov);
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a846cf89eb96..2ecec3a4f1ee 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cputime.h>
#include "sched.h"
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -23,10 +24,8 @@
* task when irq is in progress while we read rq->clock. That is a worthy
* compromise in place of having locks on each irq in account_system_time.
*/
-DEFINE_PER_CPU(u64, cpu_hardirq_time);
-DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
@@ -39,16 +38,14 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-#ifndef CONFIG_64BIT
-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-#endif /* CONFIG_64BIT */
-
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
void irqtime_account_irq(struct task_struct *curr)
{
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -56,65 +53,44 @@ void irqtime_account_irq(struct task_struct *curr)
return;
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
+ delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ irqtime->irq_start_time += delta;
- irq_time_write_begin();
+ u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
+ if (hardirq_count()) {
+ cpustat[CPUTIME_IRQ] += delta;
+ irqtime->tick_delta += delta;
+ } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
+ cpustat[CPUTIME_SOFTIRQ] += delta;
+ irqtime->tick_delta += delta;
+ }
- irq_time_write_end();
+ u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static cputime_t irqtime_account_hi_update(cputime_t maxtime)
+static u64 irqtime_tick_accounted(u64 maxtime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- cputime_t irq_cputime;
-
- local_irq_save(flags);
- irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
- cpustat[CPUTIME_IRQ];
- irq_cputime = min(irq_cputime, maxtime);
- cpustat[CPUTIME_IRQ] += irq_cputime;
- local_irq_restore(flags);
- return irq_cputime;
-}
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 delta;
-static cputime_t irqtime_account_si_update(cputime_t maxtime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- cputime_t softirq_cputime;
+ delta = min(irqtime->tick_delta, maxtime);
+ irqtime->tick_delta -= delta;
- local_irq_save(flags);
- softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
- cpustat[CPUTIME_SOFTIRQ];
- softirq_cputime = min(softirq_cputime, maxtime);
- cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
- local_irq_restore(flags);
- return softirq_cputime;
+ return delta;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
-static cputime_t irqtime_account_hi_update(cputime_t dummy)
-{
- return 0;
-}
-
-static cputime_t irqtime_account_si_update(cputime_t dummy)
+static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
}
@@ -139,22 +115,19 @@ static inline void task_group_account_field(struct task_struct *p, int index,
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+void account_user_time(struct task_struct *p, u64 cputime)
{
int index;
/* Add user time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for user time used */
acct_account_cputime(p);
@@ -164,26 +137,23 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+void account_guest_time(struct task_struct *p, u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
p->gtime += cputime;
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_NICE] += cputime;
+ cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ cpustat[CPUTIME_USER] += cputime;
+ cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -191,20 +161,17 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
* Account system cpu time to a process and desired cpustat field
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
+ * @index: index of the cpustat field that has to be updated
*/
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
+void account_system_index_time(struct task_struct *p,
+ u64 cputime, enum cpu_usage_stat index)
{
/* Add system time to process. */
p->stime += cputime;
- p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for system time used */
acct_account_cputime(p);
@@ -215,15 +182,13 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
*/
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
+void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
+ account_guest_time(p, cputime);
return;
}
@@ -234,33 +199,33 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, cputime_scaled, index);
+ account_system_index_time(p, cputime, index);
}
/*
* Account for involuntary wait time.
* @cputime: the cpu time spent in involuntary wait
*/
-void account_steal_time(cputime_t cputime)
+void account_steal_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+ cpustat[CPUTIME_STEAL] += cputime;
}
/*
* Account for idle time.
* @cputime: the cpu time spent in idle wait
*/
-void account_idle_time(cputime_t cputime)
+void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ cpustat[CPUTIME_IOWAIT] += cputime;
else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+ cpustat[CPUTIME_IDLE] += cputime;
}
/*
@@ -268,21 +233,19 @@ void account_idle_time(cputime_t cputime)
* ticks are not redelivered later. Due to that, this function may on
* occasion account more time than the calling functions think elapsed.
*/
-static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
+static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
- cputime_t steal_cputime;
u64 steal;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
+ steal = min(steal, maxtime);
+ account_steal_time(steal);
+ this_rq()->prev_steal_time += steal;
- steal_cputime = min(nsecs_to_cputime(steal), maxtime);
- account_steal_time(steal_cputime);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
-
- return steal_cputime;
+ return steal;
}
#endif
return 0;
@@ -291,21 +254,41 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
/*
* Account how much elapsed time was spent in steal, irq, or softirq time.
*/
-static inline cputime_t account_other_time(cputime_t max)
+static inline u64 account_other_time(u64 max)
{
- cputime_t accounted;
+ u64 accounted;
- accounted = steal_account_process_time(max);
+ /* Shall be converted to a lockdep-enabled lightweight check */
+ WARN_ON_ONCE(!irqs_disabled());
- if (accounted < max)
- accounted += irqtime_account_hi_update(max - accounted);
+ accounted = steal_account_process_time(max);
if (accounted < max)
- accounted += irqtime_account_si_update(max - accounted);
+ accounted += irqtime_tick_accounted(max - accounted);
return accounted;
}
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
+
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
@@ -313,11 +296,22 @@ static inline cputime_t account_other_time(cputime_t max)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
+ u64 utime, stime;
struct task_struct *t;
unsigned int seq, nextseq;
unsigned long flags;
+ /*
+ * Update current task runtime to account pending time since last
+ * scheduler action or thread_group_cputime() call. This thread group
+ * might have other running tasks on different CPUs, but updating
+ * their runtime can affect syscall performance, so we skip accounting
+ * those pending times and rely only on values updated on tick or
+ * other scheduler action.
+ */
+ if (same_thread_group(current, tsk))
+ (void) task_sched_runtime(current);
+
rcu_read_lock();
/* Attempt a lockless read on the first round. */
nextseq = 0;
@@ -332,7 +326,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
+ times->sum_exec_runtime += read_sum_exec_runtime(t);
}
/* If lockless access failed, take the lock. */
nextseq = 1;
@@ -366,8 +360,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- u64 cputime = (__force u64) cputime_one_jiffy * ticks;
- cputime_t scaled, other;
+ u64 other, cputime = TICK_NSEC * ticks;
/*
* When returning from idle, many ticks can get accounted at
@@ -379,8 +372,8 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
+
cputime -= other;
- scaled = cputime_to_scaled(cputime);
if (this_cpu_ksoftirqd() == p) {
/*
@@ -388,15 +381,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime, scaled);
+ account_user_time(p, cputime);
} else if (p == rq->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime, scaled);
+ account_guest_time(p, cputime);
} else {
- __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ account_system_index_time(p, cputime, CPUTIME_SYSTEM);
}
}
@@ -425,9 +418,7 @@ void vtime_common_task_switch(struct task_struct *prev)
else
vtime_account_system(prev);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
+ vtime_flush(prev);
arch_vtime_task_switch(prev);
}
#endif
@@ -455,14 +446,14 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -479,7 +470,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t cputime, scaled, steal;
+ u64 cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -490,19 +481,18 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- cputime = cputime_one_jiffy;
+ cputime = TICK_NSEC;
steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
cputime -= steal;
- scaled = cputime_to_scaled(cputime);
if (user_tick)
- account_user_time(p, cputime, scaled);
+ account_user_time(p, cputime);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
}
@@ -513,14 +503,14 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_idle_ticks(unsigned long ticks)
{
- cputime_t cputime, steal;
+ u64 cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- cputime = jiffies_to_cputime(ticks);
+ cputime = ticks * TICK_NSEC;
steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
@@ -534,7 +524,7 @@ void account_idle_ticks(unsigned long ticks)
* Perform (stime * rtime) / total, but avoid multiplication overflow by
* loosing precision when the numbers are big.
*/
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
u64 scaled;
@@ -571,7 +561,7 @@ drop_precision:
* followed by a 64/32->64 divide.
*/
scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
+ return scaled;
}
/*
@@ -596,14 +586,14 @@ drop_precision:
*/
static void cputime_adjust(struct task_cputime *curr,
struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
+ u64 *ut, u64 *st)
{
- cputime_t rtime, stime, utime;
+ u64 rtime, stime, utime;
unsigned long flags;
/* Serialize concurrent callers such that we can honour our guarantees */
raw_spin_lock_irqsave(&prev->lock, flags);
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+ rtime = curr->sum_exec_runtime;
/*
* This is possible under two circumstances:
@@ -634,8 +624,7 @@ static void cputime_adjust(struct task_cputime *curr,
goto update;
}
- stime = scale_stime((__force u64)stime, (__force u64)rtime,
- (__force u64)(stime + utime));
+ stime = scale_stime(stime, rtime, stime + utime);
update:
/*
@@ -668,7 +657,7 @@ out:
raw_spin_unlock_irqrestore(&prev->lock, flags);
}
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime = {
.sum_exec_runtime = p->se.sum_exec_runtime,
@@ -679,7 +668,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -689,20 +678,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static cputime_t vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
- return jiffies_to_cputime(now - tsk->vtime_snap);
+ return jiffies_to_nsecs(now - tsk->vtime_snap);
}
-static cputime_t get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- cputime_t delta, other;
+ u64 delta, other;
/*
* Unlike tick based timing, vtime based timing never has lost
@@ -711,7 +700,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
@@ -721,9 +710,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
static void __vtime_account_system(struct task_struct *tsk)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+ account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
}
void vtime_account_system(struct task_struct *tsk)
@@ -738,14 +725,10 @@ void vtime_account_system(struct task_struct *tsk)
void vtime_account_user(struct task_struct *tsk)
{
- cputime_t delta_cpu;
-
write_seqcount_begin(&tsk->vtime_seqcount);
tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk)) {
- delta_cpu = get_vtime_delta(tsk);
- account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- }
+ if (vtime_delta(tsk))
+ account_user_time(tsk, get_vtime_delta(tsk));
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -786,9 +769,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_idle_time(delta_cpu);
+ account_idle_time(get_vtime_delta(tsk));
}
void arch_vtime_task_switch(struct task_struct *prev)
@@ -815,10 +796,10 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_restore(flags);
}
-cputime_t task_gtime(struct task_struct *t)
+u64 task_gtime(struct task_struct *t)
{
unsigned int seq;
- cputime_t gtime;
+ u64 gtime;
if (!vtime_accounting_enabled())
return t->gtime;
@@ -840,29 +821,25 @@ cputime_t task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-static void
-fetch_task_cputime(struct task_struct *t,
- cputime_t *u_dst, cputime_t *s_dst,
- cputime_t *u_src, cputime_t *s_src,
- cputime_t *udelta, cputime_t *sdelta)
+void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
+ u64 delta;
unsigned int seq;
- unsigned long long delta;
- do {
- *udelta = 0;
- *sdelta = 0;
+ if (!vtime_accounting_enabled()) {
+ *utime = t->utime;
+ *stime = t->stime;
+ return;
+ }
+ do {
seq = read_seqcount_begin(&t->vtime_seqcount);
- if (u_dst)
- *u_dst = *u_src;
- if (s_dst)
- *s_dst = *s_src;
+ *utime = t->utime;
+ *stime = t->stime;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE ||
- is_idle_task(t))
+ if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
continue;
delta = vtime_delta(t);
@@ -871,54 +848,10 @@ fetch_task_cputime(struct task_struct *t,
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
- *udelta = delta;
- } else {
- if (t->vtime_snap_whence == VTIME_SYS)
- *sdelta = delta;
- }
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
+ *utime += delta;
+ else if (t->vtime_snap_whence == VTIME_SYS)
+ *stime += delta;
} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
-
-
-void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
-{
- cputime_t udelta, sdelta;
-
- if (!vtime_accounting_enabled()) {
- if (utime)
- *utime = t->utime;
- if (stime)
- *stime = t->stime;
- return;
- }
-
- fetch_task_cputime(t, utime, stime, &t->utime,
- &t->stime, &udelta, &sdelta);
- if (utime)
- *utime += udelta;
- if (stime)
- *stime += sdelta;
-}
-
-void task_cputime_scaled(struct task_struct *t,
- cputime_t *utimescaled, cputime_t *stimescaled)
-{
- cputime_t udelta, sdelta;
-
- if (!vtime_accounting_enabled()) {
- if (utimescaled)
- *utimescaled = t->utimescaled;
- if (stimescaled)
- *stimescaled = t->stimescaled;
- return;
- }
-
- fetch_task_cputime(t, utimescaled, stimescaled,
- &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
- if (utimescaled)
- *utimescaled += cputime_to_scaled(udelta);
- if (stimescaled)
- *stimescaled += cputime_to_scaled(sdelta);
-}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1ce8867283dc..27737f34757d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
- bool fallback = false;
later_rq = find_lock_later_rq(p, rq);
-
if (!later_rq) {
int cpu;
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
- fallback = true;
cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
if (cpu >= nr_cpu_ids) {
/*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
- /*
- * By now the task is replenished and enqueued; migrate it.
- */
- deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
- activate_task(later_rq, p, 0);
-
- if (!fallback)
- resched_curr(later_rq);
-
double_unlock_balance(later_rq, rq);
return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* one, and to (try to!) reconcile itself with its own scheduling
* parameters.
*/
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ WARN_ON(dl_se->dl_boosted);
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+ dl_se->runtime = dl_se->dl_runtime;
}
/*
@@ -598,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
/*
* The task might have changed its scheduling policy to something
- * different than SCHED_DEADLINE (through switched_fromd_dl()).
+ * different than SCHED_DEADLINE (through switched_from_dl()).
*/
if (!dl_task(p)) {
__dl_clear_params(p);
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
-
#ifdef CONFIG_SMP
- /*
- * Perform balancing operations here; after the replenishments. We
- * cannot drop rq->lock before this, otherwise the assertion in
- * start_dl_timer() about not missing updates is not true.
- *
- * If we find that the rq the task was on is no longer available, we
- * need to select a new rq.
- *
- * XXX figure out if select_task_rq_dl() deals with offline cpus.
- */
if (unlikely(!rq->online)) {
+ /*
+ * If the runqueue is no longer available, migrate the
+ * task elsewhere. This necessarily changes rq.
+ */
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
+
+ /*
+ * Now that the task has been migrated to the new RQ and we
+ * have that locked, proceed as normal and enqueue the task
+ * there.
+ */
}
+#endif
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+
+#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
@@ -673,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* Nothing relies on rq->lock after this, so its safe to drop
* rq->lock.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
push_dl_task(rq);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
#endif
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq)
return;
}
- /* kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
dl_rq->earliest_dl.curr = deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
}
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
struct rb_node *leftmost = dl_rq->rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
}
@@ -1129,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
}
struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -1144,11 +1133,11 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
pull_dl_task(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
/*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * pull_dl_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
}
/* Assumes rq->lock is held */
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
@@ -1723,19 +1712,28 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+
+ /* If p is not queued we will update its parameters at next wakeup. */
+ if (!task_on_rq_queued(p))
+ return;
+
+ /*
+ * If p is boosted we already updated its params in
+ * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+ * p's deadline being now already after rq_clock(rq).
+ */
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- setup_new_dl_entity(&p->dl, &p->dl);
+ setup_new_dl_entity(&p->dl);
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (rq->curr != p) {
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
-#else
+#endif
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
-#endif
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a9995256d..109adc0e9cb9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se)
return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ PN_SCHEDSTAT(se->statistics.wait_start);
+ PN_SCHEDSTAT(se->statistics.sleep_start);
+ PN_SCHEDSTAT(se->statistics.block_start);
+ PN_SCHEDSTAT(se->statistics.sleep_max);
+ PN_SCHEDSTAT(se->statistics.block_max);
+ PN_SCHEDSTAT(se->statistics.exec_max);
+ PN_SCHEDSTAT(se->statistics.slice_max);
+ PN_SCHEDSTAT(se->statistics.wait_max);
+ PN_SCHEDSTAT(se->statistics.wait_sum);
+ P_SCHEDSTAT(se->statistics.wait_count);
}
-#endif
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
#endif
+
+#undef PN_SCHEDSTAT
#undef PN
+#undef P_SCHEDSTAT
#undef P
}
#endif
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
}
#endif
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +632,7 @@ do { \
#undef P64
#endif
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
@@ -636,9 +640,8 @@ do { \
P(ttwu_count);
P(ttwu_local);
}
-
#undef P
-#endif
+
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
PN(se.exec_start);
PN(se.vruntime);
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.nr_migrations);
-#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+ PN_SCHEDSTAT(se.statistics.wait_start);
+ PN_SCHEDSTAT(se.statistics.sleep_start);
+ PN_SCHEDSTAT(se.statistics.block_start);
+ PN_SCHEDSTAT(se.statistics.sleep_max);
+ PN_SCHEDSTAT(se.statistics.block_max);
+ PN_SCHEDSTAT(se.statistics.exec_max);
+ PN_SCHEDSTAT(se.statistics.slice_max);
+ PN_SCHEDSTAT(se.statistics.wait_max);
+ PN_SCHEDSTAT(se.statistics.wait_sum);
+ P_SCHEDSTAT(se.statistics.wait_count);
+ PN_SCHEDSTAT(se.statistics.iowait_sum);
+ P_SCHEDSTAT(se.statistics.iowait_count);
+ P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+ P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+ P_SCHEDSTAT(se.statistics.nr_wakeups);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
__PN(avg_atom);
__PN(avg_per_cpu);
}
-#endif
+
__P(nr_switches);
SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +953,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#endif
P(policy);
P(prio);
+ if (p->policy == SCHED_DEADLINE) {
+ P(dl.runtime);
+ P(dl.deadline);
+ }
+#undef PN_SCHEDSTAT
#undef PN
#undef __PN
+#undef P_SCHEDSTAT
#undef P
#undef __P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..274c747a01ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,7 +37,6 @@
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -46,31 +45,35 @@
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
- = SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
+ *
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#ifdef CONFIG_SMP
/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
*/
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
+#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -109,11 +116,19 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+unsigned int capacity_margin = 1280;
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -256,9 +271,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
static inline struct task_struct *task_of(struct sched_entity *se)
{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
+ SCHED_WARN_ON(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
@@ -286,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beginning of the tree.
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If the parent is already on the list, we add the child
+ * just before it. Thanks to the circular linked property of
+ * the list, this means putting the child at the tail
+ * of the list that starts at the parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the beginning of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * Update tmp_alone_branch to point to the new beginning
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -456,17 +509,23 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *curr = cfs_rq->curr;
+
u64 vruntime = cfs_rq->min_vruntime;
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+ if (curr) {
+ if (curr->on_rq)
+ vruntime = curr->vruntime;
+ else
+ curr = NULL;
+ }
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
- if (!cfs_rq->curr)
+ if (!curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
@@ -656,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
@@ -680,7 +739,14 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are initialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are initialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -691,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -725,8 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -754,15 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = now;
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
return;
}
}
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
@@ -799,7 +858,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
@@ -820,26 +879,34 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
-#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start = rq_clock(rq_of(cfs_rq));
+ u64 wait_start, prev_wait_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ wait_start = rq_clock(rq_of(cfs_rq));
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > se->statistics.wait_start))
- wait_start -= se->statistics.wait_start;
+ likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
- se->statistics.wait_start = wait_start;
+ schedstat_set(se->statistics.wait_start, wait_start);
}
-static void
+static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;
- delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ if (!schedstat_enabled())
+ return;
+
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
p = task_of(se);
@@ -849,35 +916,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- se->statistics.wait_start = delta;
+ schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- se->statistics.wait_max = max(se->statistics.wait_max, delta);
- se->statistics.wait_count++;
- se->statistics.wait_sum += delta;
- se->statistics.wait_start = 0;
+ schedstat_set(se->statistics.wait_max,
+ max(schedstat_val(se->statistics.wait_max), delta));
+ schedstat_inc(se->statistics.wait_count);
+ schedstat_add(se->statistics.wait_sum, delta);
+ schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *tsk = NULL;
+ u64 sleep_start, block_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ sleep_start = schedstat_val(se->statistics.sleep_start);
+ block_start = schedstat_val(se->statistics.block_start);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ schedstat_set(se->statistics.sleep_max, delta);
+
+ schedstat_set(se->statistics.sleep_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ schedstat_set(se->statistics.block_max, delta);
+
+ schedstat_set(se->statistics.block_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ schedstat_add(se->statistics.iowait_sum, delta);
+ schedstat_inc(se->statistics.iowait_count);
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (!schedstat_enabled())
+ return;
+
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+
+ if (!schedstat_enabled())
+ return;
+
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
@@ -885,40 +1031,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ schedstat_set(se->statistics.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ schedstat_set(se->statistics.block_start,
+ rq_clock(rq_of(cfs_rq)));
}
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
}
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-}
-#endif
-
/*
* We are picking a new current task - update its stats:
*/
@@ -1513,8 +1637,16 @@ balance:
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ if (!cur) {
+ /*
+ * select_idle_sibling() uses a per-cpu cpumask that
+ * can be used from IRQ context.
+ */
+ local_irq_disable();
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
+ local_irq_enable();
+ }
assign:
task_numa_assign(env, cur, imp);
@@ -2292,7 +2424,7 @@ void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
@@ -2525,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
if (tg_weight)
shares /= tg_weight;
+ /*
+ * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+ * of a group with small tg->shares value. It is a floor value which is
+ * assigned as a minimum load.weight to the sched_entity representing
+ * the group on a CPU.
+ *
+ * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
+ * on an 8-core system with 8 tasks each runnable on one CPU shares has
+ * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+ * case no task is runnable on a CPU MIN_SHARES=2 should be returned
+ * instead of 0.
+ */
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
@@ -2557,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2575,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2802,10 +2951,42 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_shares() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
@@ -2869,18 +3050,143 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and is updated with the current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equal to tg->shares will have
+ * a load less than or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
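+/* Propagate a pending util/load change of a group entity's cfs_rq to its parent cfs_rq; returns 1 if propagated, 0 otherwise (always 0 for tasks) */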
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -2897,8 +3203,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ cpufreq_update_util(rq_of(cfs_rq), 0);
}
}
@@ -2931,10 +3236,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
*/
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2947,6 +3252,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -2954,6 +3260,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2970,23 +3277,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return decayed || removed_load;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
}
@@ -3000,31 +3319,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- *
- * Or we're fresh through post_init_entity_util_avg().
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3039,14 +3339,12 @@ skip_aging:
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3056,34 +3354,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3112,13 +3396,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3130,9 +3426,7 @@ void remove_entity_load_avg(struct sched_entity *se)
* calls this.
*/
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -3147,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq);
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
@@ -3157,12 +3451,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return 0;
}
-static inline void update_load_avg(struct sched_entity *se, int not_used)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- struct rq *rq = rq_of(cfs_rq);
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
- cpufreq_trigger_update(rq_clock(rq));
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
+{
+ cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
static inline void
@@ -3176,75 +3470,13 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq)
+static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
#endif /* CONFIG_SMP */
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
-
- if (entity_is_task(se))
- tsk = task_of(se);
-
- if (se->statistics.sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
-
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (se->statistics.block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
-
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
-}
-
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3486,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
d = -d;
if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
+ schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
@@ -3367,21 +3599,25 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
- if (flags & ENQUEUE_WAKEUP) {
+ if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
- if (schedstat_enabled())
- enqueue_sleeper(cfs_rq, se);
- }
check_schedstat_required();
- if (schedstat_enabled()) {
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- }
+ update_stats_enqueue(cfs_rq, se, flags);
+ check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3446,10 +3682,19 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
- if (schedstat_enabled())
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3459,9 +3704,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
account_entity_dequeue(cfs_rq, se);
/*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
+ * Normalize after update_curr(); which will also have moved
+ * min_vruntime if @se is the one holding it back. But before doing
+ * update_min_vruntime() again, which will discount @se's position and
+ * can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
@@ -3469,8 +3715,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
+
+ /*
+ * Now advance min_vruntime if @se was the entity holding it back,
+ * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+ * put back on, and if we advance min_vruntime, we'll be placed back
+ * further than we started -- ie. we'll be penalized.
+ */
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ update_min_vruntime(cfs_rq);
}
/*
@@ -3523,25 +3777,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- if (schedstat_enabled())
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ schedstat_set(se->statistics.slice_max,
+ max((u64)schedstat_val(se->statistics.slice_max),
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
-#endif
+
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
@@ -3620,13 +3874,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- if (schedstat_enabled()) {
- check_spread(cfs_rq, prev);
- if (prev->on_rq)
- update_stats_wait_start(cfs_rq, prev);
- }
+ check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -3646,8 +3897,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4456,9 +4707,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- WARN_ON(task_rq(p) != rq);
+ SCHED_WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -4509,6 +4760,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -4535,8 +4794,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
@@ -4594,8 +4853,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
@@ -4605,6 +4864,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
@@ -5006,9 +5270,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * (long)tg->shares) / W;
+ wl = (w * (long)scale_load_down(tg->shares)) / W;
else
- wl = tg->shares;
+ wl = scale_load_down(tg->shares);
/*
* Per the above, wl is the new se->load.weight value; since
@@ -5091,18 +5355,18 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
@@ -5146,17 +5410,25 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
balanced = this_eff_load <= prev_eff_load;
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
return 1;
}
+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -5166,15 +5438,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
@@ -5186,8 +5464,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -5196,22 +5479,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
else
load = target_load(i, load_idx);
- avg_load += load;
+ runnable_load += load;
+
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller
+ * so we can pick this new cpu
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close so take the
+ * blocked load into account through avg_load.
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (!idlest || 100*this_load < imbalance*min_load)
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+
+ if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
+ if (!idlest)
return NULL;
+
+ if (min_runnable_load > (this_runnable_load + imbalance))
+ return NULL;
+
+ if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load))
+ return NULL;
+
return idlest;
}
@@ -5228,6 +5573,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
@@ -5265,64 +5614,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * tbench in particular is found to be sensitive to this.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+ int next;
+
+again:
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+ if (*wrapped) {
+ if (next >= start)
+ return nr_cpumask_bits;
+ } else {
+ if (next >= nr_cpumask_bits) {
+ *wrapped = 1;
+ n = -1;
+ goto again;
+ }
+ }
+
+ return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+ for ((wrap) = 0, (cpu) = (start)-1; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+ (cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu, wrap;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+ for_each_cpu_wrap(core, cpus, target, wrap) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct sched_domain *this_sd;
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 time, cost;
+ s64 delta;
+ int cpu, wrap;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+ avg_cost = this_sd->avg_scan_cost;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particular is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return -1;
+
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
- int i = task_cpu(p);
+ int i;
if (idle_cpu(target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ return prev;
- /*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
- */
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
- }
-done:
return target;
}
@@ -5360,6 +5887,53 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util;
}
+static inline int task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+ unsigned long util, capacity;
+
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+ return (util >= capacity) ? capacity : util;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5383,7 +5957,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
rcu_read_lock();
@@ -5409,13 +5984,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) {
struct sched_group *group;
@@ -5672,7 +6247,7 @@ preempt:
}
static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -5779,15 +6354,8 @@ simple:
return p;
idle:
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- lockdep_unpin_lock(&rq->lock, cookie);
- new_tasks = idle_balance(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ new_tasks = idle_balance(rq, rf);
+
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -5939,7 +6507,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
@@ -5985,7 +6553,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -6133,7 +6701,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
@@ -6164,7 +6732,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0;
}
@@ -6181,13 +6749,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0;
}
@@ -6227,7 +6795,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
- schedstat_inc(env->sd, lb_gained[env->idle]);
+ schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
@@ -6319,7 +6887,7 @@ next:
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], detached);
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
@@ -6390,6 +6958,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6594,13 +7166,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6613,6 +7186,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -6637,26 +7211,31 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -6679,8 +7258,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
* Something like:
*
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
@@ -6751,6 +7330,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->min_capacity * capacity_margin <
+ ref->sgc->min_capacity * 1024;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -6854,6 +7444,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -6862,16 +7466,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (env->idle == CPU_NOT_IDLE)
return true;
/*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- * higher than ourself as busy.
+ * ASYM_PACKING needs to move all the work to the highest
+ * priority CPUs in the group, therefore mark all groups
+ * of lower priority than ourself as busy.
*/
- if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running &&
+ sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
- /* Prefer to move from highest possible cpu's work */
- if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+ /* Prefer to move from lowest priority cpu's work */
+ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+ sg->asym_prefer_cpu))
return true;
}
@@ -7023,8 +7629,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (!sds->busiest)
return 0;
- busiest_cpu = group_first_cpu(sds->busiest);
- if (env->dst_cpu > busiest_cpu)
+ busiest_cpu = sds->busiest->asym_prefer_cpu;
+ if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
@@ -7147,7 +7753,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
@@ -7354,9 +7960,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
@@ -7365,10 +7968,11 @@ static int need_active_balance(struct lb_env *env)
/*
* ASYM_PACKING needs to force migrate tasks from busy but
- * higher numbered CPUs in order to pack all tasks in the
- * lowest numbered CPUs.
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu))
return 1;
}
@@ -7460,7 +8064,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
- schedstat_inc(sd, lb_count[idle]);
+ schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
@@ -7470,19 +8074,19 @@ redo:
group = find_busiest_group(&env);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -7500,6 +8104,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -7589,7 +8194,7 @@ more_balance:
}
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
@@ -7672,7 +8277,7 @@ out_all_pinned:
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
@@ -7704,11 +8309,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
}
static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
- interval = get_sd_balance_interval(sd, cpu_busy);
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
@@ -7719,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_b
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq)
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -7733,12 +8339,20 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
@@ -7756,7 +8370,7 @@ static int idle_balance(struct rq *this_rq)
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
break;
}
@@ -7774,7 +8388,7 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost;
}
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
@@ -7810,6 +8424,8 @@ out:
if (pulled_task)
this_rq->idle_stamp = 0;
+ rq_repin_lock(this_rq, rf);
+
return pulled_task;
}
@@ -7864,15 +8480,16 @@ static int active_load_balance_cpu_stop(void *data)
.idle = CPU_IDLE,
};
- schedstat_inc(sd, alb_count);
+ schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
- schedstat_inc(sd, alb_pushed);
+ schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
- schedstat_inc(sd, alb_failed);
+ schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
@@ -7964,13 +8581,13 @@ static inline void set_cpu_sd_state_busy(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7981,13 +8598,13 @@ void set_cpu_sd_state_idle(void)
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -8214,9 +8831,9 @@ end:
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy, i, cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
@@ -8243,11 +8860,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
return true;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
-
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
@@ -8265,12 +8884,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
}
sd = rcu_dereference(per_cpu(sd_asym, cpu));
- if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu)) {
- kick = true;
- goto unlock;
- }
+ if (sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (i == cpu ||
+ !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+ continue;
+ if (sched_asym_prefer(i, cpu)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ }
unlock:
rcu_read_unlock();
return kick;
@@ -8283,7 +8908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
-static void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8436,12 +9061,65 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintains depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (!vruntime_normalized(p)) {
/*
@@ -8452,33 +9130,15 @@ static void detach_task_cfs_rq(struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- detach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
- /* Synchronize task with its cfs_rq */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8532,6 +9192,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -8592,7 +9255,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8607,8 +9269,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8643,7 +9303,8 @@ void online_fair_sched_group(struct task_group *tg)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
+ update_rq_clock(rq);
+ attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
}
@@ -8735,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873cfc75c..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -16,6 +16,9 @@
#include "sched.h"
+/* Linker adds these: start and end of __cpuidle functions */
+extern char __cpuidle_text_start[], __cpuidle_text_end[];
+
/**
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
__setup("hlt", cpu_idle_nopoll_setup);
#endif
-static inline int cpu_idle_poll(void)
+static noinline int __cpuidle cpu_idle_poll(void)
{
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void)
*
* To use when the cpuidle framework cannot be used.
*/
-void default_idle_call(void)
+void __cpuidle default_idle_call(void)
{
if (current_clr_polling_and_test()) {
local_irq_enable();
@@ -161,11 +164,14 @@ static void cpuidle_idle_call(void)
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
- goto exit_idle;
+
+ if (idle_should_freeze() || dev->use_deepest_state) {
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state > 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
}
next_state = cpuidle_find_deepest_state(drv, dev);
@@ -199,77 +205,122 @@ exit_idle:
*
* Called with polling cleared.
*/
-static void cpu_idle_loop(void)
+static void do_idle(void)
{
- int cpu = smp_processor_id();
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id())) {
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
+ }
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
- while (1) {
/*
- * If the arch has a polling bit, we maintain an invariant:
- *
- * Our polling bit is clear if we're not scheduled (i.e. if
- * rq->curr != rq->idle). This means that, if rq->idle has
- * the polling bit set, then setting need_resched is
- * guaranteed to cause the cpu to reschedule.
+ * In poll mode we reenable interrupts and spin. Also if we
+ * detected in the wakeup from idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle as we know that the IPI is going to arrive right away.
*/
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+ arch_cpu_idle_exit();
+ }
- __current_set_polling();
- quiet_vmstat();
- tick_nohz_idle_enter();
+ /*
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
+}
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
- cpu_idle_poll();
- else
- cpuidle_idle_call();
-
- arch_cpu_idle_exit();
- }
+bool cpu_in_idle(unsigned long pc)
+{
+ return pc >= (unsigned long)__cpuidle_text_start &&
+ pc < (unsigned long)__cpuidle_text_end;
+}
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- __current_clr_polling();
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
- /*
- * We promise to call sched_ttwu_pending and reschedule
- * if need_resched is set while polling is set. That
- * means that clearing polling needs to be visible
- * before doing these things.
- */
- smp_mb__after_atomic();
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
- sched_ttwu_pending();
- schedule_preempt_disabled();
- }
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+ struct idle_timer it;
+
+ /*
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
+ */
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ms);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(true);
+
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(false);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
}
+EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state)
{
@@ -290,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
- cpu_idle_loop();
+ while (1)
+ do_idle();
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458bbe1d..0c00172db63e 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -24,11 +24,11 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
}
static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
-
- schedstat_inc(rq, sched_goidle);
+ update_idle_core(rq);
+ schedstat_inc(rq->sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b722691..e8836cfc4cdb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -957,9 +958,8 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
- /* Kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
}
static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
@@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
pull_rt_task(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -2199,10 +2199,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#else
+#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
resched_curr(rq);
-#endif /* CONFIG_SMP */
}
}
@@ -2247,6 +2246,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
}
}
+#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
@@ -2268,6 +2268,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
}
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc5114004..71b10a9b73cf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,7 +2,9 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
+#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
+#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
@@ -15,6 +17,12 @@
#include "cpudeadline.h"
#include "cpuacct.h"
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+#else
+#define SCHED_WARN_ON(x) ((void)(x))
+#endif
+
struct rq;
struct cpuidle_state;
@@ -215,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
-extern struct mutex sched_domains_mutex;
+extern void init_dl_bw(struct dl_bw *dl_b);
#ifdef CONFIG_CGROUP_SCHED
@@ -397,6 +405,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -532,6 +541,11 @@ struct dl_rq {
#ifdef CONFIG_SMP
+static inline bool sched_asym_prefer(int a, int b)
+{
+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -565,9 +579,18 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
+
+ unsigned long max_cpu_capacity;
};
extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+extern cpumask_var_t fallback_doms;
+extern cpumask_var_t sched_domains_tmpmask;
+
+extern void init_defrootdomain(void);
+extern int init_sched_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -597,7 +620,6 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
#endif /* CONFIG_SMP */
- u64 nohz_stamp;
unsigned long nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
@@ -615,6 +637,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -629,7 +652,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
- unsigned int clock_skip_update;
+ unsigned int clock_update_flags;
u64 clock;
u64 clock_task;
@@ -723,6 +746,23 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+
+#ifdef CONFIG_SCHED_SMT
+
+extern struct static_key_false sched_smt_present;
+
+extern void __update_idle_core(struct rq *rq);
+
+static inline void update_idle_core(struct rq *rq)
+{
+ if (static_branch_unlikely(&sched_smt_present))
+ __update_idle_core(rq);
+}
+
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
@@ -736,28 +776,110 @@ static inline u64 __rq_clock_broken(struct rq *rq)
return READ_ONCE(rq->clock);
}
+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ * call to __schedule(). This is an optimisation to avoid
+ * neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ * in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ * made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ * if (rq->clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
+
+static inline void assert_clock_updated(struct rq *rq)
+{
+ /*
+ * The only reason for not seeing a clock update since the
+ * last rq_pin_lock() is if we're currently skipping updates.
+ */
+ SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+}
+
static inline u64 rq_clock(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
return rq->clock_task;
}
-#define RQCF_REQ_SKIP 0x01
-#define RQCF_ACT_SKIP 0x02
-
static inline void rq_clock_skip_update(struct rq *rq, bool skip)
{
lockdep_assert_held(&rq->lock);
if (skip)
- rq->clock_skip_update |= RQCF_REQ_SKIP;
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
else
- rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+}
+
+struct rq_flags {
+ unsigned long flags;
+ struct pin_cookie cookie;
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+ * current pin context is stashed here in case it needs to be
+ * restored in rq_repin_lock().
+ */
+ unsigned int clock_update_flags;
+#endif
+};
+
+static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rf->clock_update_flags = 0;
+#endif
+}
+
+static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ if (rq->clock_update_flags > RQCF_ACT_SKIP)
+ rf->clock_update_flags = RQCF_UPDATED;
+#endif
+
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
+}
+
+static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ lockdep_repin_lock(&rq->lock, rf->cookie);
+
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * Restore the value we stashed in @rf for this pin context.
+ */
+ rq->clock_update_flags |= rf->clock_update_flags;
+#endif
}
#ifdef CONFIG_NUMA
@@ -771,6 +893,16 @@ extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
#endif
+#ifdef CONFIG_NUMA
+extern void sched_init_numa(void);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
@@ -857,8 +989,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
@@ -867,13 +999,10 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- /*
- * Number of busy cpus in this group.
- */
- atomic_t nr_busy_cpus;
unsigned long cpumask[0]; /* iteration mask */
};
@@ -884,6 +1013,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ int asym_prefer_cpu; /* cpu of highest priority in group */
/*
* The CPUs this group covers.
@@ -939,7 +1069,7 @@ static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
#include "stats.h"
-#include "auto_group.h"
+#include "autogroup.h"
#ifdef CONFIG_CGROUP_SCHED
@@ -1000,7 +1130,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ p->cpu = cpu;
+#else
task_thread_info(p)->cpu = cpu;
+#endif
p->wake_cpu = cpu;
#endif
}
@@ -1211,7 +1345,7 @@ struct sched_class {
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev,
- struct pin_cookie cookie);
+ struct rq_flags *rf);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
@@ -1260,6 +1394,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
prev->sched_class->put_prev_task(rq, prev);
}
+static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+{
+ curr->sched_class->set_curr_task(rq);
+}
+
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1290,7 +1429,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- WARN_ON(!rcu_read_lock_held());
+ SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
#else
@@ -1462,11 +1601,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
-struct rq_flags {
- unsigned long flags;
- struct pin_cookie cookie;
-};
-
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -1476,7 +1610,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
}
@@ -1485,7 +1619,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1635,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
+extern bool sched_smp_initialized;
+
#else /* CONFIG_SMP */
/*
@@ -1710,52 +1848,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 tick_delta;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
-DECLARE_PER_CPU(u64, cpu_hardirq_time);
-DECLARE_PER_CPU(u64, cpu_softirq_time);
-
-#ifndef CONFIG_64BIT
-DECLARE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
static inline u64 irq_time_read(int cpu)
{
- u64 irq_time;
- unsigned seq;
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
+ unsigned int seq;
+ u64 total;
do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+ return total;
}
-#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -1763,27 +1877,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
- * @time: Current time.
- * @util: Current utilization.
- * @max: Utilization ceiling.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
*
- * This function is called by the scheduler on every invocation of
- * update_load_avg() on the CPU whose utilization is being updated.
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
*
* It can only be called from RCU-sched read-side critical sections.
- */
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
-{
- struct update_util_data *data;
-
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
- if (data)
- data->func(data, time, util, max);
-}
-
-/**
- * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
- * @time: Current time.
*
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
@@ -1797,13 +1897,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
* but that really is a band-aid. Going forward it should be replaced with
* solutions targeted more specifically at RT and DL tasks.
*/
-static inline void cpufreq_trigger_update(u64 time)
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
+}
+
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
{
- cpufreq_update_util(time, ULONG_MAX, 0);
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_update_util(rq, flags);
}
#else
-static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
-static inline void cpufreq_trigger_update(u64 time) {}
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 78955cbea31c..bf0da0aa0a14 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
-# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
-# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
-# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
-# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0)
+#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var) (var)
+#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
-# define schedstat_enabled() 0
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val) do { } while (0)
-# define schedstat_val(rq, field) 0
-#endif
+#define schedstat_enabled() 0
+#define schedstat_inc(var) do { } while (0)
+#define schedstat_add(var, amt) do { } while (0)
+#define schedstat_set(var, val) do { } while (0)
+#define schedstat_val(var) 0
+#define schedstat_val_or_zero(var) 0
+#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
@@ -170,18 +172,19 @@ sched_info_switch(struct rq *rq,
*/
/**
- * cputimer_running - return true if cputimer is running
+ * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
*
* @tsk: Pointer to target task.
*/
-static inline bool cputimer_running(struct task_struct *tsk)
-
+#ifdef CONFIG_POSIX_TIMERS
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
/* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(cputimer->running))
- return false;
+ return NULL;
/*
* After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
@@ -198,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk)
* clock delta is behind the expiring timer value.
*/
if (unlikely(!tsk->sighand))
- return false;
+ return NULL;
- return true;
+ return cputimer;
+}
+#else
+static inline
+struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
+{
+ return NULL;
}
+#endif
/**
* account_group_user_time - Maintain utime for a thread group.
@@ -214,11 +224,11 @@ static inline bool cputimer_running(struct task_struct *tsk)
* running CPU and update the utime field there.
*/
static inline void account_group_user_time(struct task_struct *tsk,
- cputime_t cputime)
+ u64 cputime)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(cputime, &cputimer->cputime_atomic.utime);
@@ -235,11 +245,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
* running CPU and update the stime field there.
*/
static inline void account_group_system_time(struct task_struct *tsk,
- cputime_t cputime)
+ u64 cputime)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(cputime, &cputimer->cputime_atomic.stime);
@@ -258,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
- if (!cputimer_running(tsk))
+ if (!cputimer)
return;
atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 604297a08b3a..9f69fb630853 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
}
static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000000..1b0b4fb12837
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,1658 @@
+/*
+ * Scheduler topology setup/handling methods
+ */
+#include <linux/sched.h>
+#include <linux/mutex.h>
+
+#include "sched.h"
+
+DEFINE_MUTEX(sched_domains_mutex);
+
+/* Protected by sched_domains_mutex: */
+cpumask_var_t sched_domains_tmpmask;
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+/*
+ * Handler for the "sched_debug" early boot parameter. The mere presence of
+ * the parameter switches sched-domain debug output on; @str is not examined.
+ */
+static int __init sched_debug_setup(char *str)
+{
+	sched_debug_enabled = 1;
+	return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+/* Tell callers whether the "sched_debug" boot parameter was given. */
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled != 0;
+}
+
+/*
+ * Print and sanity-check one level of the sched_domain hierarchy of @cpu.
+ *
+ * @sd:        domain to describe
+ * @cpu:       CPU the hierarchy belongs to
+ * @level:     depth of @sd, used for indentation and in the output
+ * @groupmask: scratch mask; cleared here, then accumulates the CPUs of all
+ *             groups visited
+ *
+ * Returns -1 when @sd does not load-balance (the caller stops walking up),
+ * 0 otherwise.
+ */
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+				  struct cpumask *groupmask)
+{
+	struct sched_group *group = sd->groups;
+
+	cpumask_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		/* Continues the "domain N: " line started above. */
+		printk(KERN_CONT "does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent\n");
+		return -1;
+	}
+
+	printk(KERN_CONT "span %*pbl level %s\n",
+		cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+	/*
+	 * A NULL group list is reported by the loop below; do not touch the
+	 * group's cpumask before that check has had a chance to run.
+	 */
+	if (group && !cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+		printk(KERN_ERR "ERROR: domain->groups does not contain"
+				" CPU%d\n", cpu);
+	}
+
+	printk(KERN_DEBUG "%*s groups:", level + 1, "");
+	do {
+		if (!group) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: group is NULL\n");
+			break;
+		}
+
+		if (!cpumask_weight(sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: empty group\n");
+			break;
+		}
+
+		/* Non-overlapping domains must not list a CPU in two groups. */
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: repeated CPUs\n");
+			break;
+		}
+
+		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+		printk(KERN_CONT " %*pbl",
+			cpumask_pr_args(sched_group_cpus(group)));
+		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+			printk(KERN_CONT " (cpu_capacity = %lu)",
+				group->sgc->capacity);
+		}
+
+		group = group->next;
+	} while (group != sd->groups);
+	printk(KERN_CONT "\n");
+
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
+
+/*
+ * Dump the sched_domain hierarchy being attached to @cpu, starting at @sd and
+ * walking towards the root. Does nothing unless debugging is enabled.
+ */
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int depth = 0;
+
+	if (!sched_debug_enabled)
+		return;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	/* @sd is known to be non-NULL here, so test it after each step. */
+	do {
+		if (sched_domain_debug_one(sd, cpu, depth, sched_domains_tmpmask))
+			break;
+		depth++;
+		sd = sd->parent;
+	} while (sd);
+}
+#else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
+# define sched_domain_debug(sd, cpu) do { } while (0)
+/* Stub for !CONFIG_SCHED_DEBUG builds: debugging is never enabled. */
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+/*
+ * Decide whether @sd contributes nothing to scheduling.
+ *
+ * Returns 1 when the domain is degenerate, 0 when it has to be kept.
+ */
+static int sd_degenerate(struct sched_domain *sd)
+{
+	/* Flags that only make sense with at least two groups. */
+	const int multi_group_flags = SD_LOAD_BALANCE |
+				      SD_BALANCE_NEWIDLE |
+				      SD_BALANCE_FORK |
+				      SD_BALANCE_EXEC |
+				      SD_SHARE_CPUCAPACITY |
+				      SD_ASYM_CPUCAPACITY |
+				      SD_SHARE_PKG_RESOURCES |
+				      SD_SHARE_POWERDOMAIN;
+
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
+		return 1;
+
+	if ((sd->flags & multi_group_flags) &&
+	    sd->groups != sd->groups->next)
+		return 0;
+
+	/* SD_WAKE_AFFINE does not use groups, so it alone keeps the domain. */
+	return (sd->flags & SD_WAKE_AFFINE) ? 0 : 1;
+}
+
+/*
+ * Decide whether @parent adds nothing on top of its child @sd.
+ *
+ * Returns 1 when @parent can be dropped, 0 when it must be kept.
+ */
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+	unsigned long child_flags = sd->flags;
+	unsigned long parent_flags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	/* A parent covering different CPUs than the child is never redundant. */
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+		return 0;
+
+	/* With a single group in the parent, group-based flags do not count. */
+	if (parent->groups == parent->groups->next) {
+		parent_flags &= ~(SD_LOAD_BALANCE |
+				  SD_BALANCE_NEWIDLE |
+				  SD_BALANCE_FORK |
+				  SD_BALANCE_EXEC |
+				  SD_ASYM_CPUCAPACITY |
+				  SD_SHARE_CPUCAPACITY |
+				  SD_SHARE_PKG_RESOURCES |
+				  SD_PREFER_SIBLING |
+				  SD_SHARE_POWERDOMAIN);
+		if (nr_node_ids == 1)
+			parent_flags &= ~SD_SERIALIZE;
+	}
+
+	/* Any remaining flag the parent has but the child lacks keeps it. */
+	return (~child_flags & parent_flags) ? 0 : 1;
+}
+
+/*
+ * RCU callback: tear down and free the root_domain that embeds @rcu.
+ */
+static void free_rootdomain(struct rcu_head *rcu)
+{
+	struct root_domain *root;
+
+	root = container_of(rcu, struct root_domain, rcu);
+
+	cpupri_cleanup(&root->cpupri);
+	cpudl_cleanup(&root->cpudl);
+
+	free_cpumask_var(root->dlo_mask);
+	free_cpumask_var(root->rto_mask);
+	free_cpumask_var(root->online);
+	free_cpumask_var(root->span);
+
+	kfree(root);
+}
+
+/*
+ * Move @rq from its current root domain (if any) to @rd.
+ *
+ * The switch is done under rq->lock with interrupts disabled. A reference is
+ * taken on @rd; the reference on the previous root domain is dropped, and if
+ * that was the last one the old root domain is freed via an RCU callback once
+ * the lock has been released.
+ */
+void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ /* Take the rq offline in the old root domain if it was online there. */
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we don't want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ /* Only CPUs present in cpu_active_mask are brought online in @rd. */
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ /* Deferred free of the old root domain, outside the rq lock. */
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+/*
+ * Zero @rd and allocate/initialise its cpumasks, deadline bandwidth,
+ * cpudl and cpupri state.
+ *
+ * Returns 0 on success. On any failure everything set up so far is released
+ * in reverse order and -ENOMEM is returned (whatever the failing step
+ * reported).
+ */
+static int init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ goto free_online;
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_rto_mask;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_cpudl;
+ return 0;
+
+ /* Unwind labels: each one undoes the step just before the failing one. */
+free_cpudl:
+ cpudl_cleanup(&rd->cpudl);
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all CPUs as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+/*
+ * Set up the statically allocated default root domain and give it its
+ * initial reference. The result of init_rootdomain() is not checked here.
+ */
+void init_defrootdomain(void)
+{
+	struct root_domain *rd = &def_root_domain;
+
+	init_rootdomain(rd);
+	atomic_set(&rd->refcount, 1);
+}
+
+/*
+ * Allocate and initialise a new root domain.
+ *
+ * Returns the root domain, or NULL if either the allocation or the
+ * initialisation failed.
+ */
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+
+	if (rd && init_rootdomain(rd) != 0) {
+		kfree(rd);
+		rd = NULL;
+	}
+
+	return rd;
+}
+
+/*
+ * Free every group on the circular list starting at @sg. When @free_sgc is
+ * non-zero, a reference is also dropped on each group's capacity structure,
+ * which is freed when that was the last reference. A NULL @sg is a no-op.
+ */
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
+{
+	struct sched_group *head = sg, *next;
+
+	if (!head)
+		return;
+
+	do {
+		/* Fetch the successor before @sg goes away. */
+		next = sg->next;
+
+		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+			kfree(sg->sgc);
+
+		kfree(sg);
+		sg = next;
+	} while (sg != head);
+}
+
+/*
+ * Free @sd together with the groups and shared state it holds references on.
+ */
+static void destroy_sched_domain(struct sched_domain *sd)
+{
+	struct sched_domain_shared *shared = sd->shared;
+
+	if (sd->flags & SD_OVERLAP) {
+		/* Overlapping domains own private groups: free the whole list. */
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgc);
+		kfree(sd->groups);
+	}
+
+	if (shared && atomic_dec_and_test(&shared->ref))
+		kfree(shared);
+
+	kfree(sd);
+}
+
+/*
+ * RCU callback: destroy the domain embedding @rcu and all of its ancestors.
+ */
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
+{
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+	struct sched_domain *parent;
+
+	for (; sd; sd = parent) {
+		/* Remember the parent before @sd is freed. */
+		parent = sd->parent;
+		destroy_sched_domain(sd);
+	}
+}
+
+/* Queue the hierarchy starting at @sd for destruction via RCU; NULL is a no-op. */
+static void destroy_sched_domains(struct sched_domain *sd)
+{
+	if (!sd)
+		return;
+
+	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
+}
+
+/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
+ * allows us to avoid some pointer chasing in select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first CPU number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two CPUs are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
+DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+
+/*
+ * Refresh the per-CPU shortcut pointers (sd_llc and friends, sd_numa,
+ * sd_asym) of @cpu from its current domain hierarchy.
+ */
+static void update_top_cache_domain(int cpu)
+{
+	struct sched_domain_shared *shared = NULL;
+	struct sched_domain *llc, *numa, *asym;
+	int llc_id = cpu;
+	int llc_size = 1;
+
+	llc = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	if (llc) {
+		const struct cpumask *span = sched_domain_span(llc);
+
+		llc_id = cpumask_first(span);
+		llc_size = cpumask_weight(span);
+		shared = llc->shared;
+	}
+
+	/* Without a cache-sharing domain the CPU is its own LLC of size 1. */
+	rcu_assign_pointer(per_cpu(sd_llc, cpu), llc);
+	per_cpu(sd_llc_size, cpu) = llc_size;
+	per_cpu(sd_llc_id, cpu) = llc_id;
+	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), shared);
+
+	numa = lowest_flag_domain(cpu, SD_NUMA);
+	rcu_assign_pointer(per_cpu(sd_numa, cpu), numa);
+
+	asym = highest_flag_domain(cpu, SD_ASYM_PACKING);
+	rcu_assign_pointer(per_cpu(sd_asym, cpu), asym);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ *
+ * Before attaching, degenerate levels are pruned from the hierarchy. The
+ * runqueue is then moved to root domain 'rd', the new hierarchy is published
+ * in rq->sd, the previous hierarchy is queued for destruction and the per-CPU
+ * cache-domain shortcuts are refreshed.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ /* Unlink 'parent': splice 'tmp' directly under the grandparent. */
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent);
+ /* 'tmp' is not advanced: its new parent is examined next. */
+ } else
+ tmp = tmp->parent;
+ }
+
+ /* The base domain itself may be degenerate; if so, drop it as well. */
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ /* Publish the new hierarchy, then queue the old one for destruction. */
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp);
+
+ update_top_cache_domain(cpu);
+}
+
+/*
+ * Setup the mask of CPUs configured for isolated domains.
+ *
+ * Handler for the "isolcpus=" boot parameter: parses the CPU list in @str
+ * into cpu_isolated_map. Returns 1 on success and 0 if the list could not
+ * be parsed.
+ */
+static int __init isolated_cpu_setup(char *str)
+{
+	int ret;
+
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	ret = cpulist_parse(str, cpu_isolated_map);
+	if (ret) {
+		/* Valid CPU numbers run from 0 to nr_cpu_ids - 1. */
+		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n",
+		       nr_cpu_ids - 1);
+		return 0;
+	}
+	return 1;
+}
+__setup("isolcpus=", isolated_cpu_setup);
+
+/* Temporary state carried through one sched-domain build. */
+struct s_data {
+ /* Per-CPU domain pointer, allocated with alloc_percpu(). */
+ struct sched_domain ** __percpu sd;
+ /* Root domain allocated for this build. */
+ struct root_domain *rd;
+};
+
+/*
+ * How far allocation got, i.e. where __free_domain_allocs() has to start
+ * unwinding. Each state also implies freeing everything listed after it.
+ */
+enum s_alloc {
+ /* Root domain, per-CPU sd pointers and topology storage allocated. */
+ sa_rootdomain,
+ /* Per-CPU sd pointers and topology storage allocated. */
+ sa_sd,
+ /* Only topology storage (possibly partial) allocated. */
+ sa_sd_storage,
+ /* Nothing to free. */
+ sa_none,
+};
+
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * CPU they're built on, so check that.
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+	struct sd_data *sdd = sd->private;
+	const struct cpumask *span = sched_domain_span(sd);
+	int cpu;
+
+	for_each_cpu(cpu, span) {
+		struct sched_domain *sibling = *per_cpu_ptr(sdd->sd, cpu);
+
+		/*
+		 * Only CPUs that are part of their own domain at this level
+		 * make it into the group's iteration mask.
+		 */
+		if (cpumask_test_cpu(cpu, sched_domain_span(sibling)))
+			cpumask_set_cpu(cpu, sched_group_mask(sg));
+	}
+}
+
+/*
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	const struct cpumask *cpus = sched_group_cpus(sg);
+	const struct cpumask *mask = sched_group_mask(sg);
+
+	/* First CPU present in both the group and its iteration mask. */
+	return cpumask_first_and(cpus, mask);
+}
+
+/*
+ * Build the circular list of sched_groups for the overlapping domain @sd of
+ * @cpu. One group is allocated per not-yet-covered CPU in the domain span.
+ *
+ * Uses sched_domains_tmpmask as scratch space.
+ *
+ * Returns 0 on success, or -ENOMEM if a group allocation fails; in that case
+ * the groups allocated so far are freed (their sgc is left alone).
+ */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ /* CPU already belongs to a group built in an earlier iteration. */
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ goto fail;
+
+ /* The group spans the sibling's child domain, or just CPU i. */
+ sg_span = sched_group_cpus(sg);
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ /* Only the first user of this sgc builds the iteration mask. */
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance CPU. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
+ groups = sg;
+
+ /* Append to the list and keep it circular at every step. */
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
+/*
+ * Find the CPU that identifies the group @cpu belongs to at the level
+ * described by @sdd: the first CPU of the child domain's span, or @cpu
+ * itself when there is no child.
+ *
+ * If @sg is non-NULL, *@sg is set to that CPU's pre-allocated group, with its
+ * capacity structure attached and the capacity refcount set to 1.
+ *
+ * Returns the identifying CPU.
+ */
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+	struct sched_domain *child = (*per_cpu_ptr(sdd->sd, cpu))->child;
+	struct sched_group *group;
+
+	if (child)
+		cpu = cpumask_first(sched_domain_span(child));
+
+	if (!sg)
+		return cpu;
+
+	group = *per_cpu_ptr(sdd->sg, cpu);
+	group->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+	/* For claim_allocations: */
+	atomic_set(&group->sgc->ref, 1);
+
+	*sg = group;
+
+	return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly.
+ * Group capacity is not written here.
+ *
+ * Every CPU takes a reference on its own first group, but only the first CPU
+ * of the span actually links the groups together. Always returns 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
+
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ /* The list is built once per span, by its first CPU. */
+ if (cpu != cpumask_first(span))
+ return 0;
+
+ /* sched_domains_tmpmask is protected by sched_domains_mutex. */
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group, j;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ group = get_group(i, sdd, &sg);
+ cpumask_setall(sched_group_mask(sg));
+
+ /* Collect every CPU of the span that maps to the same group. */
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
+ continue;
+
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
+ }
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ /* Close the ring. */
+ last->next = first;
+
+ return 0;
+}
+
+/*
+ * Initialize sched groups cpu_capacity.
+ *
+ * cpu_capacity indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
+ */
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+{
+	struct sched_group *sg = sd->groups;
+
+	WARN_ON(!sg);
+
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+
+		if (sd->flags & SD_ASYM_PACKING) {
+			int i, max_cpu = -1;
+
+			/* Pick the CPU sched_asym_prefer() ranks first. */
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (max_cpu < 0 || sched_asym_prefer(i, max_cpu))
+					max_cpu = i;
+			}
+			sg->asym_prefer_cpu = max_cpu;
+		}
+
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	/* Only the balance CPU of the first group updates the capacity. */
+	if (cpu != group_balance_cpu(sg))
+		return;
+
+	update_group_capacity(sd, cpu);
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+/*
+ * Handler for the "relax_domain_level=" boot parameter: parse @str into
+ * default_relax_domain_level and warn if that fails. Always returns 1.
+ */
+static int __init setup_relax_domain_level(char *str)
+{
+	int err = kstrtoint(str, 0, &default_relax_domain_level);
+
+	if (err)
+		pr_warn("Unable to set relax_domain_level\n");
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+/*
+ * Apply the relax-domain-level request to @sd. The level comes from @attr
+ * when it carries a non-negative value, otherwise from the boot-time default;
+ * if neither is set the domain flags are left untouched.
+ */
+static void set_domain_attribute(struct sched_domain *sd,
+				 struct sched_domain_attr *attr)
+{
+	const int idle_balance = SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE;
+	int request;
+
+	if (attr && attr->relax_domain_level >= 0)
+		request = attr->relax_domain_level;
+	else if (default_relax_domain_level >= 0)
+		request = default_relax_domain_level;
+	else
+		return;
+
+	if (request < sd->level)
+		sd->flags &= ~idle_balance;	/* Turn off idle balance. */
+	else
+		sd->flags |= idle_balance;	/* Turn on idle balance. */
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+/*
+ * Undo the allocations of __visit_domain_allocation_hell(), starting at the
+ * stage named by @what and falling through to every earlier stage.
+ */
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ /* Free the root domain only if nothing holds a reference on it. */
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu);
+ /* Fall through */
+ case sa_sd:
+ free_percpu(d->sd);
+ /* Fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map);
+ /* Fall through */
+ case sa_none:
+ break;
+ }
+}
+
+/*
+ * Allocate everything a sched-domain build needs: the per-topology-level
+ * storage, the per-CPU domain pointers in @d and a root domain.
+ *
+ * Returns sa_rootdomain on success; any other value names the stage that
+ * __free_domain_allocs() has to unwind from.
+ */
+static enum s_alloc
+__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
+{
+	memset(d, 0, sizeof(*d));
+
+	if (__sdt_alloc(cpu_map) != 0)
+		return sa_sd_storage;
+
+	d->sd = alloc_percpu(struct sched_domain *);
+	if (d->sd == NULL)
+		return sa_sd_storage;
+
+	d->rd = alloc_rootdomain();
+
+	return d->rd ? sa_rootdomain : sa_sd;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+	struct sched_domain **sd_slot = per_cpu_ptr(sdd->sd, cpu);
+	struct sched_domain_shared **sds_slot = per_cpu_ptr(sdd->sds, cpu);
+	struct sched_group **sg_slot = per_cpu_ptr(sdd->sg, cpu);
+	struct sched_group_capacity **sgc_slot = per_cpu_ptr(sdd->sgc, cpu);
+
+	/* The domain itself is always in use. */
+	WARN_ON_ONCE(*sd_slot != sd);
+	*sd_slot = NULL;
+
+	/* The other objects are claimed only when something references them. */
+	if (atomic_read(&(*sds_slot)->ref))
+		*sds_slot = NULL;
+
+	if (atomic_read(&(*sg_slot)->ref))
+		*sg_slot = NULL;
+
+	if (atomic_read(&(*sgc_slot)->ref))
+		*sgc_slot = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
+static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
+ SD_SHARE_POWERDOMAIN)
+
+/*
+ * Initialise the pre-allocated sched_domain of @cpu for topology level @tl.
+ *
+ * @tl:      topology level supplying the CPU mask, the descriptive flags and
+ *           the per-CPU storage (tl->data)
+ * @cpu_map: CPUs that domains are being built for; the domain span is
+ *           tl->mask(cpu) restricted to this map
+ * @child:   domain one level below, or NULL for the lowest level
+ * @cpu:     CPU whose domain is initialised
+ *
+ * Returns the initialised domain.
+ */
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl,
+	const struct cpumask *cpu_map,
+	struct sched_domain *child, int cpu)
+{
+	struct sd_data *sdd = &tl->data;
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	int sd_id, sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	/*
+	 * A topology description may only use TOPOLOGY_SD_FLAGS. Warn about
+	 * anything else and strip it, keeping the valid flags.
+	 */
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= TOPOLOGY_SD_FLAGS;
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 1*SD_WAKE_AFFINE
+					| 0*SD_SHARE_CPUCAPACITY
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 0*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| 0*SD_NUMA
+					| sd_flags
+					,
+
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
+		.max_newidle_lb_cost	= 0,
+		.next_decay_max_lb_cost	= jiffies,
+		.child			= child,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
+	};
+
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	sd_id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * Convert topological properties into behaviour.
+	 */
+
+	if (sd->flags & SD_ASYM_CPUCAPACITY) {
+		struct sched_domain *t = sd;
+
+		for_each_lower_domain(t)
+			t->flags |= SD_BALANCE_WAKE;
+	}
+
+	if (sd->flags & SD_SHARE_CPUCAPACITY) {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	/*
+	 * For all levels sharing cache; connect a sched_domain_shared
+	 * instance.
+	 */
+	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		/* The shared instance is the one of the span's first CPU. */
+		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+		atomic_inc(&sd->shared->ref);
+		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+	}
+
+	sd->private = sdd;
+
+	return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+/*
+ * Install @tl as the topology table used for building sched domains.
+ * Warns once and leaves the table unchanged if sched_smp_initialized is
+ * already set.
+ */
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	if (!WARN_ON_ONCE(sched_smp_initialized))
+		sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+/*
+ * Topology mask callback for NUMA levels: the mask of @cpu's node at the
+ * level currently stored in sched_domains_curr_level.
+ */
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	int node = cpu_to_node(cpu);
+
+	return sched_domains_numa_masks[sched_domains_curr_level][node];
+}
+
+/*
+ * Print @str as an error followed by the full node distance table. Only the
+ * first call prints anything.
+ */
+static void sched_numa_warn(const char *str)
+{
+	static int done = false;
+	int row, col;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (row = 0; row < nr_node_ids; row++) {
+		printk(KERN_WARNING " ");
+		for (col = 0; col < nr_node_ids; col++)
+			printk(KERN_CONT "%02d ", node_distance(row, col));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+/*
+ * Check whether @distance is known: either the local distance
+ * node_distance(0, 0) or one of the recorded NUMA level distances.
+ */
+bool find_numa_distance(int distance)
+{
+	int level;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (level = 0; level < sched_domains_numa_levels; level++)
+		if (sched_domains_numa_distance[level] == distance)
+			return true;
+
+	return false;
+}
+
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+	const int max_dist = sched_max_numa_distance;
+	int a, b, c;
+
+	if (sched_domains_numa_levels <= 1) {
+		sched_numa_topology_type = NUMA_DIRECT;
+		return;
+	}
+
+	for_each_online_node(a) {
+		for_each_online_node(b) {
+			/* Find two nodes furthest removed from each other. */
+			if (node_distance(a, b) < max_dist)
+				continue;
+
+			/* Is there an intermediary node between a and b? */
+			for_each_online_node(c) {
+				if (node_distance(a, c) >= max_dist ||
+				    node_distance(b, c) >= max_dist)
+					continue;
+
+				sched_numa_topology_type = NUMA_GLUELESS_MESH;
+				return;
+			}
+
+			/* No intermediary for the first such pair found. */
+			sched_numa_topology_type = NUMA_BACKPLANE;
+			return;
+		}
+	}
+}
+
+/*
+ * Discover the distinct NUMA distances, build a CPU mask per (level, node)
+ * and install a topology table that extends the current one with one
+ * overlapping NUMA level per distance.
+ *
+ * Returns early, without installing a new table, if there are no NUMA levels
+ * or if any allocation fails.
+ * NOTE(review): the early-return paths do not free what was allocated before
+ * the failure.
+ */
+void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ /* Track the smallest distance larger than curr_distance. */
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ /* Record the new distance as the next level, or stop if none is left. */
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+
+ if (!level)
+ return;
+
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain fewer than 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * CPUs of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ /* Include the CPUs of every node within this level's distance of j. */
+ for_each_node(k) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ /* Room for the existing levels, 'level' NUMA levels and the terminator. */
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+ /* Distances were recorded in increasing order; the last is the largest. */
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
+}
+
+/*
+ * Add @cpu to every per-level node mask whose node lies within that level's
+ * distance of @cpu's node.
+ */
+void sched_domains_numa_masks_set(unsigned int cpu)
+{
+	int node = cpu_to_node(cpu);
+	int level, n;
+
+	for (level = 0; level < sched_domains_numa_levels; level++) {
+		for (n = 0; n < nr_node_ids; n++) {
+			if (node_distance(n, node) > sched_domains_numa_distance[level])
+				continue;
+
+			cpumask_set_cpu(cpu, sched_domains_numa_masks[level][n]);
+		}
+	}
+}
+
+/* Remove @cpu from every per-level, per-node NUMA mask. */
+void sched_domains_numa_masks_clear(unsigned int cpu)
+{
+	int level, node;
+
+	for (level = 0; level < sched_domains_numa_levels; level++)
+		for (node = 0; node < nr_node_ids; node++)
+			cpumask_clear_cpu(cpu, sched_domains_numa_masks[level][node]);
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * For every topology level, allocate the per-CPU pointer arrays in tl->data
+ * and, for each CPU in @cpu_map, a sched_domain, sched_domain_shared,
+ * sched_group and sched_group_capacity on that CPU's node.
+ *
+ * Returns 0 on success or -ENOMEM on the first failed allocation. Nothing is
+ * freed here on failure; presumably the caller cleans up via __sdt_free().
+ */
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_domain_shared *sds;
+ struct sched_group *sg;
+ struct sched_group_capacity *sgc;
+
+ /* The extra cpumask_size() bytes hold the trailing cpumask. */
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ /* A fresh group starts out as a one-element ring. */
+ sg->next = sg;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Free whatever per-topology-level storage is still present for the CPUs in
+ * @cpu_map, then the per-CPU pointer arrays themselves. Copes with partially
+ * allocated state: missing arrays are skipped.
+ */
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
+
+		for_each_cpu(j, cpu_map) {
+			if (sdd->sd) {
+				struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+
+				/* Overlapping domains carry private groups. */
+				if (sd && (sd->flags & SD_OVERLAP))
+					free_sched_groups(sd->groups, 0);
+				kfree(sd);
+			}
+
+			if (sdd->sds)
+				kfree(*per_cpu_ptr(sdd->sds, j));
+			if (sdd->sg)
+				kfree(*per_cpu_ptr(sdd->sg, j));
+			if (sdd->sgc)
+				kfree(*per_cpu_ptr(sdd->sgc, j));
+		}
+
+		free_percpu(sdd->sd);
+		sdd->sd = NULL;
+
+		free_percpu(sdd->sds);
+		sdd->sds = NULL;
+
+		free_percpu(sdd->sg);
+		sdd->sg = NULL;
+
+		free_percpu(sdd->sgc);
+		sdd->sgc = NULL;
+	}
+}
+
+/*
+ * Initialise the sched domain of @cpu for topology level @tl and, when a
+ * @child domain is given, link it in as the child's parent.
+ *
+ * With a child, the new domain's level is the child's level plus one and
+ * sched_domain_level_max is raised if needed. If the child's span is not a
+ * subset of the new domain's span, an error is logged and the child's CPUs
+ * are merged into the new span. @attr is applied last.
+ *
+ * Returns the domain obtained from sd_init().
+ */
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+		struct sched_domain *child, int cpu)
+{
+	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+
+	if (child) {
+		sd->level = child->level + 1;
+		if (sched_domain_level_max < sd->level)
+			sched_domain_level_max = sd->level;
+		child->parent = sd;
+
+		if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
+			pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+			pr_err(" the %s domain not a subset of the %s domain\n",
+			       child->name, sd->name);
+#endif
+			/* Fixup, ensure @sd has at least @child cpus. */
+			cpumask_or(sched_domain_span(sd), sched_domain_span(sd),
+				   sched_domain_span(child));
+		}
+	}
+
+	set_domain_attribute(sd, attr);
+
+	return sd;
+}
+
+/*
+ * Build sched domains for a given set of CPUs and attach the sched domains
+ * to the individual CPUs.
+ *
+ * The work is done in four passes over @cpu_map: build the domain hierarchy
+ * of each CPU across the topology levels, build the groups of every domain,
+ * claim the allocations and initialise the group capacities (walking the
+ * CPU numbers from the highest down), and finally attach each CPU's lowest
+ * domain together with the root domain under rcu_read_lock().
+ *
+ * Returns 0 on success and -ENOMEM if the allocation step or the group
+ * building fails. __free_domain_allocs() is called on both paths.
+ */
+static int
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ struct rq *rq = NULL;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for CPUs specified by the cpu_map: */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
+ sd = *per_cpu_ptr(d.sd, i);
+
+ /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ if (rq && sched_debug_enabled) {
+ pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+ }
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
+
+/* Current sched domains: */
+static cpumask_var_t *doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int ndoms_cur;
+
+/* Attributes of custom domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * CPU core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ *
+ * This is the __weak default; it always reports "unchanged" (0).
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+/*
+ * Allocate an array of @ndoms cpumasks for use as a sched domain partition.
+ *
+ * Returns the array, or NULL if either the array or one of its cpumasks
+ * could not be allocated; in the latter case the masks allocated so far are
+ * released again. Free the result with free_sched_domains().
+ */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	unsigned int i;
+	cpumask_var_t *doms;
+
+	/* kmalloc_array() fails cleanly if ndoms * sizeof(*doms) overflows. */
+	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			/* Only the first i masks were allocated. */
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+/* Release the first @ndoms cpumasks of @doms[] and then the array itself. */
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int idx = 0;
+
+	while (idx < ndoms) {
+		free_cpumask_var(doms[idx]);
+		idx++;
+	}
+
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated CPUs, but could be used to
+ * exclude other special cases in the future.
+ *
+ * A single partition is created from @cpu_map minus cpu_isolated_map. If the
+ * partition array cannot be allocated, fallback_doms is used instead.
+ *
+ * Returns the result of build_sched_domains() for that partition.
+ */
+int init_sched_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach the sched domains of every CPU in @cpu_map: under rcu_read_lock(),
+ * each CPU is attached to the NULL domain and to def_root_domain.
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int cpu;
+
+	rcu_read_lock();
+	for_each_cpu(cpu, cpu_map) {
+		cpu_attach_domain(NULL, &def_root_domain, cpu);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Compare entry @idx_cur of @cur with entry @idx_new of @new. A NULL array
+ * stands for the default attributes (SD_ATTR_INIT). Returns 1 if the two
+ * entries are bytewise identical and 0 otherwise.
+ */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr dflt;
+	const struct sched_domain_attr *lhs, *rhs;
+
+	/* Both sides default: nothing to compare. */
+	if (!cur && !new)
+		return 1;
+
+	dflt = SD_ATTR_INIT;
+	lhs = cur ? (cur + idx_cur) : &dflt;
+	rhs = new ? (new + idx_new) : &dflt;
+
+	return memcmp(lhs, rhs, sizeof(struct sched_domain_attr)) == 0;
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is, unless arch_update_cpu_topology() reports a topology
+ * change, in which case every domain is destroyed and rebuilt.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with the active CPUs minus
+ * the isolated ones (cpu_active_mask & ~cpu_isolated_map), stored in
+ * 'fallback_doms'.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * 'dattr_new' is taken over as well: the previous 'dattr_cur' is
+ * kfree()d and replaced by it.
+ *
+ * Call with hotplug lock held; sched_domains_mutex is taken internally.
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* Always unregister in case we don't destroy any domains: */
+ unregister_sched_domain_sysctl();
+
+ /* Let the architecture update CPU core mappings: */
+ new_topology = arch_update_cpu_topology();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains: */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* No match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (doms_new == NULL) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ WARN_ON_ONCE(dattr_new);
+ }
+
+ /* Build new domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* No match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains: */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+
+ kfree(dattr_cur);
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+void init_wait_entry(wait_queue_t *wait, int flags)
{
- unsigned long flags;
-
- if (signal_pending_state(state, current))
- return -ERESTARTSYS;
-
+ wait->flags = flags;
wait->private = current;
wait->func = autoremove_wake_function;
+ INIT_LIST_HEAD(&wait->task_list);
+}
+EXPORT_SYMBOL(init_wait_entry);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+ long ret = 0;
spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list)) {
- if (wait->flags & WQ_FLAG_EXCLUSIVE)
- __add_wait_queue_tail(q, wait);
- else
- __add_wait_queue(q, wait);
+ if (unlikely(signal_pending_state(state, current))) {
+ /*
+ * Exclusive waiter must not fail if it was selected by wakeup,
+ * it should "consume" the condition we were waiting for.
+ *
+ * The caller will recheck the condition and return success if
+ * we were already woken up, we can not miss the event because
+ * wakeup locks/unlocks the same q->lock.
+ *
+ * But we need to ensure that set-condition + wakeup after that
+ * can't see us, it should wake up another exclusive waiter if
+ * we fail.
+ */
+ list_del_init(&wait->task_list);
+ ret = -ERESTARTSYS;
+ } else {
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
}
- set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(prepare_to_wait_event);
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
-/**
- * abort_exclusive_wait - abort exclusive waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
- * @mode: runstate of the waiter to be woken
- * @key: key to identify a wait bit queue or %NULL
- *
- * Sets current thread back to running state and removes
- * the wait descriptor from the given waitqueue if still
- * queued.
- *
- * Wakes up the next waiter if the caller is concurrently
- * woken up through the queue.
- *
- * This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and no one wakes up
- * the next waiter.
- */
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
- unsigned int mode, void *key)
-{
- unsigned long flags;
-
- __set_current_state(TASK_RUNNING);
- spin_lock_irqsave(&q->lock, flags);
- if (!list_empty(&wait->task_list))
- list_del_init(&wait->task_list);
- else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(abort_exclusive_wait);
-
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
@@ -425,20 +413,29 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode)
{
- do {
- int ret;
+ int ret = 0;
+ for (;;) {
prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (!test_bit(q->key.bit_nr, q->key.flags))
- continue;
- ret = action(&q->key, mode);
- if (!ret)
- continue;
- abort_exclusive_wait(wq, &q->wait, mode, &q->key);
- return ret;
- } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
- finish_wait(wq, &q->wait);
- return 0;
+ if (test_bit(q->key.bit_nr, q->key.flags)) {
+ ret = action(&q->key, mode);
+ /*
+ * See the comment in prepare_to_wait_event().
+ * finish_wait() does not necessarily take wq->lock,
+ * but test_and_set_bit() implies mb() which pairs with
+ * smp_mb__after_atomic() before wake_up_page().
+ */
+ if (ret)
+ finish_wait(wq, &q->wait);
+ }
+ if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
+ if (!ret)
+ finish_wait(wq, &q->wait);
+ return 0;
+ } else if (ret) {
+ return ret;
+ }
+ }
}
EXPORT_SYMBOL(__wait_on_bit_lock);
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- const struct zone *zone = page_zone(virt_to_page(word));
- unsigned long val = (unsigned long)word << shift | bit;
-
- return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
/*
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
* index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0db7c8a2afe2..f8f88ebcb3ba 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -16,6 +16,7 @@
#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
+#include <linux/coredump.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
@@ -41,8 +42,7 @@
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
- * @len: the number of instructions in the program
- * @insnsi: the BPF program instructions to evaluate
+ * @prog: the BPF program to evaluate
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -168,8 +168,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
}
/**
- * seccomp_run_filters - evaluates all seccomp filters against @syscall
- * @syscall: number of the current system call
+ * seccomp_run_filters - evaluates all seccomp filters against @sd
+ * @sd: optional seccomp data to be passed to filters
*
* Returns valid seccomp BPF response codes.
*/
@@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
- u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
+ u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
@@ -487,6 +487,17 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/*
+ * Fill @info with a zero-initialised SIGSYS / SYS_SECCOMP siginfo that
+ * describes @syscall, with @reason stored in si_errno.
+ */
+static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
+{
+	memset(info, 0, sizeof(*info));
+
+	/* Generic signal fields: */
+	info->si_signo = SIGSYS;
+	info->si_errno = reason;
+	info->si_code = SYS_SECCOMP;
+
+	/* SIGSYS-specific fields: */
+	info->si_call_addr = (void __user *)KSTK_EIP(current);
+	info->si_syscall = syscall;
+	info->si_arch = syscall_get_arch();
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -497,13 +508,7 @@ void put_seccomp_filter(struct task_struct *tsk)
static void seccomp_send_sigsys(int syscall, int reason)
{
struct siginfo info;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGSYS;
- info.si_code = SYS_SECCOMP;
- info.si_call_addr = (void __user *)KSTK_EIP(current);
- info.si_errno = reason;
- info.si_arch = syscall_get_arch();
- info.si_syscall = syscall;
+ seccomp_init_siginfo(&info, syscall, reason);
force_sig_info(SIGSYS, &info, current);
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -635,10 +640,17 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default:
+ default: {
+ siginfo_t info;
audit_seccomp(this_syscall, SIGSYS, action);
+ /* Show the original registers in the dump. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Trigger a manual coredump since do_exit skips it. */
+ seccomp_init_siginfo(&info, this_syscall, data);
+ do_coredump(&info);
do_exit(SIGSYS);
}
+ }
unreachable();
diff --git a/kernel/signal.c b/kernel/signal.c
index af21afc00d08..13f9def8b24a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,7 +39,7 @@
#include <trace/events/signal.h>
#include <asm/param.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
@@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task)
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
- sig->flags = SIGNAL_STOP_STOPPED;
+ signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
@@ -427,6 +427,7 @@ void flush_signals(struct task_struct *t)
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+#ifdef CONFIG_POSIX_TIMERS
static void __flush_itimer_signals(struct sigpending *pending)
{
sigset_t signal, retain;
@@ -460,6 +461,7 @@ void flush_itimer_signals(void)
__flush_itimer_signals(&tsk->signal->shared_pending);
spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}
+#endif
void ignore_signals(struct task_struct *t)
{
@@ -567,6 +569,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
+#ifdef CONFIG_POSIX_TIMERS
/*
* itimer signal ?
*
@@ -584,12 +587,13 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) &&
- tsk->signal->it_real_incr.tv64 != 0) {
+ tsk->signal->it_real_incr != 0) {
hrtimer_forward(tmr, tmr->base->get_time(),
tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
+#endif
}
recalc_sigpending();
@@ -611,6 +615,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
+#ifdef CONFIG_POSIX_TIMERS
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
* Release the siglock to ensure proper locking order
@@ -622,6 +627,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
do_schedule_next_timer(info);
spin_lock(&tsk->sighand->siglock);
}
+#endif
return signr;
}
@@ -837,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
- signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
@@ -1575,7 +1581,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
- cputime_t utime, stime;
+ u64 utime, stime;
BUG_ON(sig == -1);
@@ -1614,8 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
- info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
+ info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1679,7 +1685,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- cputime_t utime, stime;
+ u64 utime, stime;
if (for_ptracer) {
parent = tsk->parent;
@@ -1699,8 +1705,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- info.si_utime = cputime_to_clock_t(utime);
- info.si_stime = cputime_to_clock_t(stime);
+ info.si_utime = nsec_to_clock_t(utime);
+ info.si_stime = nsec_to_clock_t(stime);
info.si_code = why;
switch (why) {
@@ -2485,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
+ /*
+ * In case the signal mask hasn't changed, there is nothing we need
+ * to do. The current->blocked shouldn't be modified by other task.
+ */
+ if (sigequalsets(&tsk->blocked, newset))
+ return;
+
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock);
@@ -2753,7 +2766,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
- ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
+ ktime_t *to = NULL, timeout = KTIME_MAX;
struct task_struct *tsk = current;
sigset_t mask = *which;
int sig, ret = 0;
@@ -2773,7 +2786,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
- if (!sig && timeout.tv64) {
+ if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
@@ -3044,6 +3057,11 @@ void kernel_sigaction(int sig, __sighandler_t action)
}
EXPORT_SYMBOL(kernel_sigaction);
+void __weak sigaction_compat_abi(struct k_sigaction *act,
+ struct k_sigaction *oact)
+{ /* Weak default: intentionally does nothing with @act and @oact. */
+}
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *p = current, *t;
@@ -3059,6 +3077,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
if (oact)
*oact = *k;
+ sigaction_compat_abi(act, oact);
+
if (act) {
sigdelsetmask(&act->sa.sa_mask,
sigmask(SIGKILL) | sigmask(SIGSTOP));
diff --git a/kernel/smp.c b/kernel/smp.c
index 3aa642d39c03..77fcdb9f2775 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,9 @@
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
@@ -14,6 +17,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/hypervisor.h>
#include "smpboot.h"
@@ -542,19 +546,17 @@ void __init setup_nr_cpu_ids(void)
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
-void __weak smp_announce(void)
-{
- printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
-}
-
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
+ int num_nodes, num_cpus;
unsigned int cpu;
idle_threads_init();
cpuhp_threads_init();
+ pr_info("Bringing up secondary CPUs ...\n");
+
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
@@ -563,8 +565,13 @@ void __init smp_init(void)
cpu_up(cpu);
}
+ num_nodes = num_online_nodes();
+ num_cpus = num_online_cpus();
+ pr_info("Brought up %d node%s, %d CPU%s\n",
+ num_nodes, (num_nodes > 1 ? "s" : ""),
+ num_cpus, (num_cpus > 1 ? "s" : ""));
+
/* Any cleanup work */
- smp_announce();
smp_cpus_done(setup_max_cpus);
}
@@ -724,3 +731,54 @@ void wake_up_all_idle_cpus(void)
preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
+
+/**
+ * smp_call_on_cpu - Call a function on a specific cpu
+ *
+ * Used to call a function on a specific cpu and wait for it to return.
+ * Optionally make sure the call is done on a specified physical cpu via vcpu
+ * pinning in order to support virtualized environments.
+ */
+struct smp_call_on_cpu_struct {
+ struct work_struct work; /* work item executed by smp_call_on_cpu_callback() */
+ struct completion done; /* completed by the callback after func returned */
+ int (*func)(void *); /* function to call */
+ void *data; /* argument handed to func */
+ int ret; /* return value of func */
+ int cpu; /* CPU to pin the vcpu to; negative means no pinning */
+};
+
+/*
+ * Work function behind smp_call_on_cpu(): run the requested function,
+ * pinning the vcpu around the call when a non-negative CPU number was
+ * supplied, store the result and signal the waiter.
+ */
+static void smp_call_on_cpu_callback(struct work_struct *work)
+{
+	struct smp_call_on_cpu_struct *req =
+		container_of(work, struct smp_call_on_cpu_struct, work);
+	const bool pin = req->cpu >= 0;
+
+	if (pin)
+		hypervisor_pin_vcpu(req->cpu);
+
+	req->ret = req->func(req->data);
+
+	/* -1 undoes the pinning requested above. */
+	if (pin)
+		hypervisor_pin_vcpu(-1);
+
+	complete(&req->done);
+}
+
+/*
+ * Run @func(@par) on @cpu via the system workqueue and wait for it to
+ * finish. If @phys is true the callback additionally asks the hypervisor to
+ * pin the vcpu to physical CPU @cpu for the duration of the call.
+ *
+ * Returns -ENXIO if @cpu is out of range or not online, otherwise the value
+ * returned by @func.
+ */
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+	struct smp_call_on_cpu_struct sscs = {
+		.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
+		.func = func,
+		.data = par,
+		.cpu = phys ? cpu : -1,
+	};
+
+	/* Validate first so no on-stack work is left initialised on failure. */
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -ENXIO;
+
+	INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
+
+	queue_work_on(cpu, system_wq, &sscs.work);
+	wait_for_completion(&sscs.done);
+
+	/* Pairs with INIT_WORK_ONSTACK(); needed for work debug objects. */
+	destroy_work_on_stack(&sscs.work);
+
+	return sscs.ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 13bc43d1fb22..4a5c6e73ecd4 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kfree(td);
return PTR_ERR(tsk);
}
+ /*
+ * Park the thread so that it could start right on the CPU
+ * when it is available.
+ */
+ kthread_park(tsk);
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
if (ht->create) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 17caf4b63342..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
const char * const softirq_to_name[NR_SOFTIRQS] = {
- "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
@@ -78,6 +78,17 @@ static void wakeup_softirqd(void)
}
/*
+ * If ksoftirqd is scheduled, we do not want to process pending softirqs
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ */
+static bool ksoftirqd_running(void)
+{
+	struct task_struct *ksoftirqd_task = __this_cpu_read(ksoftirqd);
+
+	/* No ksoftirqd task recorded for this CPU. */
+	if (!ksoftirqd_task)
+		return false;
+
+	return ksoftirqd_task->state == TASK_RUNNING;
+}
+
+/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
@@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending)
+ if (pending && !ksoftirqd_running())
do_softirq_own_stack();
local_irq_restore(flags);
@@ -340,6 +351,9 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
+ if (ksoftirqd_running())
+ return;
+
if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-static void tasklet_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a)
}
}
-static void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
BUG();
}
-static void takeover_tasklets(unsigned int cpu)
+static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
local_irq_disable();
@@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu)
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
+ return 0;
}
+#else
+#define takeover_tasklets NULL
#endif /* CONFIG_HOTPLUG_CPU */
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action) {
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- takeover_tasklets((unsigned long)hcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
@@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = {
static __init int spawn_ksoftirqd(void)
{
- register_cpu_notifier(&cpu_nfb);
-
+ cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
+ takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b6e4c16377c7..9c15a9124e83 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -18,10 +18,8 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
if (WARN_ON(!trace->entries))
return;
- for (i = 0; i < trace->nr_entries; i++) {
- printk("%*c", 1 + spaces, ' ');
- print_ip_sym(trace->entries[i]);
- }
+ for (i = 0; i < trace->nr_entries; i++)
+ printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
}
EXPORT_SYMBOL_GPL(print_stack_trace);
@@ -29,7 +27,6 @@ int snprint_stack_trace(char *buf, size_t size,
struct stack_trace *trace, int spaces)
{
int i;
- unsigned long ip;
int generated;
int total = 0;
@@ -37,9 +34,8 @@ int snprint_stack_trace(char *buf, size_t size,
return 0;
for (i = 0; i < trace->nr_entries; i++) {
- ip = trace->entries[i];
- generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
- 1 + spaces, ' ', (void *) ip, (void *) ip);
+ generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
+ (void *)trace->entries[i]);
total += generated;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4a1ca5f6da7e..1eb82661ecdb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
-#include <linux/lglock.h>
#include <linux/nmi.h>
/*
@@ -47,13 +46,9 @@ struct cpu_stopper {
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;
-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static bool stop_cpus_in_progress;
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
@@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
return -ENOENT;
+ /*
+ * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
+ * cycle by doing a preemption:
+ */
+ cond_resched();
wait_for_completion(&done.completion);
return done.ret;
}
@@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data)
/* Simple state machine */
do {
/* Chill out and ensure we re-read multi_stop_state. */
- cpu_relax();
+ cpu_relax_yield();
if (msdata->state != curstate) {
curstate = msdata->state;
switch (curstate) {
@@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
int err;
-
- lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+retry:
spin_lock_irq(&stopper1->lock);
spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
goto unlock;
+ /*
+ * Ensure that if we race with __stop_cpus() the stoppers won't get
+ * queued up in reverse order leading to system deadlock.
+ *
+ * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
+ * queued a work on cpu1 but not on cpu2, we hold both locks.
+ *
+ * It can be falsely true but it is safe to spin until it is cleared,
+ * queue_stop_cpus_work() does everything under preempt_disable().
+ */
+ err = -EDEADLK;
+ if (unlikely(stop_cpus_in_progress))
+ goto unlock;
err = 0;
__cpu_stop_queue_work(stopper1, work1);
@@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
unlock:
spin_unlock(&stopper2->lock);
spin_unlock_irq(&stopper1->lock);
- lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+ if (unlikely(err == -EDEADLK)) {
+ while (stop_cpus_in_progress)
+ cpu_relax();
+ goto retry;
+ }
return err;
}
/**
@@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
return cpu_stop_queue_work(cpu, work_buf);
}
-/* static data for stop_cpus */
-static DEFINE_MUTEX(stop_cpus_mutex);
-
static bool queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
@@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- lg_global_lock(&stop_cpus_lock);
+ preempt_disable();
+ stop_cpus_in_progress = true;
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
- lg_global_unlock(&stop_cpus_lock);
+ stop_cpus_in_progress = false;
+ preempt_enable();
return queued;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 89d5be418157..7d4a9a6df956 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -57,7 +57,7 @@
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
@@ -881,15 +881,15 @@ SYSCALL_DEFINE0(getegid)
void do_sys_times(struct tms *tms)
{
- cputime_t tgutime, tgstime, cutime, cstime;
+ u64 tgutime, tgstime, cutime, cstime;
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- tms->tms_utime = cputime_to_clock_t(tgutime);
- tms->tms_stime = cputime_to_clock_t(tgstime);
- tms->tms_cutime = cputime_to_clock_t(cutime);
- tms->tms_cstime = cputime_to_clock_t(cstime);
+ tms->tms_utime = nsec_to_clock_t(tgutime);
+ tms->tms_stime = nsec_to_clock_t(tgstime);
+ tms->tms_cutime = nsec_to_clock_t(cutime);
+ tms->tms_cstime = nsec_to_clock_t(cstime);
}
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
@@ -1416,7 +1416,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
* applications, so we live with it
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
- new_rlim->rlim_cur != RLIM_INFINITY)
+ new_rlim->rlim_cur != RLIM_INFINITY &&
+ IS_ENABLED(CONFIG_POSIX_TIMERS))
update_rlimit_cpu(tsk, new_rlim->rlim_cur);
out:
read_unlock(&tasklist_lock);
@@ -1543,7 +1544,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
{
struct task_struct *t;
unsigned long flags;
- cputime_t tgutime, tgstime, utime, stime;
+ u64 tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
memset((char *)r, 0, sizeof (*r));
@@ -1599,8 +1600,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
unlock_task_sighand(p, &flags);
out:
- cputime_to_timeval(utime, &r->ru_utime);
- cputime_to_timeval(stime, &r->ru_stime);
+ r->ru_utime = ns_to_timeval(utime);
+ r->ru_stime = ns_to_timeval(stime);
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
@@ -1696,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
fput(exe_file);
}
- /*
- * The symlink can be changed only once, just to disallow arbitrary
- * transitions malicious software might bring in. This means one
- * could make a snapshot over all processes running and monitor
- * /proc/pid/exe changes to notice unusual activity if needed.
- */
- err = -EPERM;
- if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit;
-
err = 0;
/* set the new file, lockless */
get_file(exe.file);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..8acef8576ce9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy);
cond_syscall(sys_io_submit);
cond_syscall(sys_io_cancel);
cond_syscall(sys_io_getevents);
+cond_syscall(compat_sys_io_setup);
+cond_syscall(compat_sys_io_submit);
+cond_syscall(compat_sys_io_getevents);
cond_syscall(sys_sysfs);
cond_syscall(sys_syslog);
cond_syscall(sys_process_vm_readv);
@@ -250,3 +253,8 @@ cond_syscall(sys_execveat);
/* membarrier */
cond_syscall(sys_membarrier);
+
+/* memory protection keys */
+cond_syscall(sys_pkey_mprotect);
+cond_syscall(sys_pkey_alloc);
+cond_syscall(sys_pkey_free);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bbdaab47d..bb260ceb3718 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -65,8 +65,9 @@
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
#include <linux/bpf.h>
+#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#ifdef CONFIG_X86
@@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit;
extern int pid_max;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
-extern int compat_log;
extern int latencytop_enabled;
-extern int sysctl_nr_open_min, sysctl_nr_open_max;
+extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
@@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "sched_shares_window_ns",
- .data = &sysctl_sched_shares_window,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -423,7 +416,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "sched_rr_timeslice_ms",
- .data = &sched_rr_timeslice,
+ .data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rr_handler,
@@ -634,7 +627,7 @@ static struct ctl_table kern_table[] = {
.data = &tracepoint_printk,
.maxlen = sizeof(tracepoint_printk),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = tracepoint_printk_sysctl,
},
#endif
#ifdef CONFIG_KEXEC_CORE
@@ -990,13 +983,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "kstack_depth_to_print",
- .data = &kstack_depth_to_print,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "io_delay_type",
.data = &io_delay_type,
.maxlen = sizeof(int),
@@ -1084,15 +1070,6 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
},
#endif
-#ifdef CONFIG_COMPAT
- {
- .procname = "compat-log",
- .data = &compat_log,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_RT_MUTEXES
{
.procname = "max_lock_depth",
@@ -1692,7 +1669,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "nr_open",
.data = &sysctl_nr_open,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &sysctl_nr_open_min,
@@ -1838,6 +1815,14 @@ static struct ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+ {
+ .procname = "mount-max",
+ .data = &sysctl_mount_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
{ }
};
@@ -2404,9 +2389,11 @@ static void validate_coredump_safety(void)
#ifdef CONFIG_COREDUMP
if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING "Unsafe core_pattern used with "\
- "suid_dumpable=2. Pipe handler or fully qualified "\
- "core dump path required.\n");
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
}
#endif
}
@@ -2488,6 +2475,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
break;
if (neg)
continue;
+ val = convmul * val / convdiv;
if ((min && val < *min) || (max && val > *max))
continue;
*i = val;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6eb99c17dbd8..ece4b177052b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
"warning: process `%s' used the deprecated sysctl "
"system call with ", current->comm);
for (i = 0; i < nlen; i++)
- printk("%d.", name[i]);
- printk("\n");
+ printk(KERN_CONT "%d.", name[i]);
+ printk(KERN_CONT "\n");
}
return;
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee20d18..8a5e44236f78 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -41,12 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;
-static struct genl_family family = {
- .id = GENL_ID_GENERATE,
- .name = TASKSTATS_GENL_NAME,
- .version = TASKSTATS_GENL_VERSION,
- .maxattr = TASKSTATS_CMD_ATTR_MAX,
-};
+static struct genl_family family;
static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
@@ -54,7 +49,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
-static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+/*
+ * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
+ * Make sure they are always aligned.
+ */
+static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};
@@ -651,6 +650,15 @@ static const struct genl_ops taskstats_ops[] = {
},
};
+static struct genl_family family __ro_after_init = {
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+ .module = THIS_MODULE,
+ .ops = taskstats_ops,
+ .n_ops = ARRAY_SIZE(taskstats_ops),
+};
+
/* Needed early in initialization */
void __init taskstats_init_early(void)
{
@@ -667,7 +675,7 @@ static int __init taskstats_init(void)
{
int rc;
- rc = genl_register_family_with_ops(&family, taskstats_ops);
+ rc = genl_register_family(&family);
if (rc)
return rc;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 49eca0beed32..938dbf33ef49 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,12 @@
-obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
+obj-y += time.o timer.o hrtimer.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
-obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
+obj-y += timeconv.o timecounter.o alarmtimer.o
+
+ifeq ($(CONFIG_POSIX_TIMERS),y)
+ obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o
+else
+ obj-y += posix-stubs.o
+endif
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
@@ -9,6 +15,5 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
-obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..e6dc9a538efa 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,9 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/alarmtimer.h>
+
/**
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
@@ -40,7 +43,9 @@ static struct alarm_base {
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
-/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+/* freezer information to handle clock_nanosleep triggered wakeups */
+static enum alarmtimer_type freezer_alarmtype;
+static ktime_t freezer_expires;
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
@@ -194,6 +199,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
+ trace_alarmtimer_fired(alarm, base->gettime());
return ret;
}
@@ -218,16 +224,17 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
*/
static int alarmtimer_suspend(struct device *dev)
{
- struct rtc_time tm;
- ktime_t min, now;
- unsigned long flags;
+ ktime_t min, now, expires;
+ int i, ret, type;
struct rtc_device *rtc;
- int i;
- int ret;
+ unsigned long flags;
+ struct rtc_time tm;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
- freezer_delta = ktime_set(0, 0);
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
spin_unlock_irqrestore(&freezer_delta_lock, flags);
rtc = alarmtimer_get_rtcdev();
@@ -247,10 +254,13 @@ static int alarmtimer_suspend(struct device *dev)
if (!next)
continue;
delta = ktime_sub(next->expires, base->gettime());
- if (!min.tv64 || (delta.tv64 < min.tv64))
+ if (!min || (delta < min)) {
+ expires = next->expires;
min = delta;
+ type = i;
+ }
}
- if (min.tv64 == 0)
+ if (min == 0)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
@@ -258,6 +268,8 @@ static int alarmtimer_suspend(struct device *dev)
return -EBUSY;
}
+ trace_alarmtimer_suspend(expires, type);
+
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
rtc_read_time(rtc, &tm);
@@ -265,7 +277,7 @@ static int alarmtimer_suspend(struct device *dev)
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
- ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
@@ -295,15 +307,32 @@ static int alarmtimer_resume(struct device *dev)
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
- ktime_t delta;
+ struct alarm_base *base;
unsigned long flags;
- struct alarm_base *base = &alarm_bases[type];
+ ktime_t delta;
+
+ switch(type) {
+ case ALARM_REALTIME:
+ base = &alarm_bases[ALARM_REALTIME];
+ type = ALARM_REALTIME_FREEZER;
+ break;
+ case ALARM_BOOTTIME:
+ base = &alarm_bases[ALARM_BOOTTIME];
+ type = ALARM_BOOTTIME_FREEZER;
+ break;
+ default:
+ WARN_ONCE(1, "Invalid alarm type: %d\n", type);
+ return;
+ }
delta = ktime_sub(absexp, base->gettime());
spin_lock_irqsave(&freezer_delta_lock, flags);
- if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
+ if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
+ freezer_expires = absexp;
+ freezer_alarmtype = type;
+ }
spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
@@ -342,6 +371,8 @@ void alarm_start(struct alarm *alarm, ktime_t start)
alarmtimer_enqueue(base, alarm);
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
+
+ trace_alarmtimer_start(alarm, base->gettime());
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -390,6 +421,8 @@ int alarm_try_to_cancel(struct alarm *alarm)
if (ret >= 0)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
+
+ trace_alarmtimer_cancel(alarm, base->gettime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
@@ -420,10 +453,10 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
delta = ktime_sub(now, alarm->node.expires);
- if (delta.tv64 < 0)
+ if (delta < 0)
return 0;
- if (unlikely(delta.tv64 >= interval.tv64)) {
+ if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
overrun = ktime_divns(delta, incr);
@@ -431,7 +464,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
alarm->node.expires = ktime_add_ns(alarm->node.expires,
incr*overrun);
- if (alarm->node.expires.tv64 > now.tv64)
+ if (alarm->node.expires > now)
return overrun;
/*
* This (and the ktime_add() below) is the
@@ -483,12 +516,13 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
spin_lock_irqsave(&ptr->it_lock, flags);
if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
- if (posix_timer_event(ptr, 0) != 0)
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
+ posix_timer_event(ptr, 0) != 0)
ptr->it_overrun++;
}
/* Re-add periodic timers */
- if (ptr->it.alarm.interval.tv64) {
+ if (ptr->it.alarm.interval) {
ptr->it_overrun += alarm_forward(alarm, now,
ptr->it.alarm.interval);
result = ALARMTIMER_RESTART;
@@ -542,7 +576,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
static int alarm_timer_create(struct k_itimer *new_timer)
{
enum alarmtimer_type type;
- struct alarm_base *base;
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
@@ -551,7 +584,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
return -EPERM;
type = clock2alarm(new_timer->it_clock);
- base = &alarm_bases[type];
alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
return 0;
}
@@ -698,7 +730,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
rem = ktime_sub(exp, alarm_bases[type].gettime());
- if (rem.tv64 <= 0)
+ if (rem <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -723,7 +755,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
struct alarm alarm;
int ret = 0;
- exp.tv64 = restart->nanosleep.expires;
+ exp = restart->nanosleep.expires;
alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
if (alarmtimer_do_nsleep(&alarm, exp))
@@ -803,7 +835,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
- restart->nanosleep.expires = exp.tv64;
+ restart->nanosleep.expires = exp;
restart->nanosleep.rmtp = rmtp;
ret = -ERESTART_RESTARTBLOCK;
@@ -848,8 +880,10 @@ static int __init alarmtimer_init(void)
alarmtimer_rtc_timer_init();
- posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
- posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+ posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+ }
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2c5bc77c0bb0..97ac0951f164 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -179,7 +179,7 @@ void clockevents_switch_state(struct clock_event_device *dev,
void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
}
/**
@@ -213,7 +213,7 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
printk_deferred(KERN_WARNING
"CE: Reprogramming failure. Giving up\n");
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
return -ETIME;
}
@@ -310,7 +310,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
int64_t delta;
int rc;
- if (unlikely(expires.tv64 < 0)) {
+ if (unlikely(expires < 0)) {
WARN_ON_ONCE(1);
return -ETIME;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6a5a310a1a53..93621ae718d3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
*mult = tmp;
*shift = sft;
}
+EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
/*[Clocksource internal variables]---------
* curr_clocksource:
@@ -140,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
+
+ if (cs->mark_unstable)
+ cs->mark_unstable(cs);
+
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -169,7 +174,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow, cslast, wdlast, delta;
+ u64 csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -600,9 +605,18 @@ static void __clocksource_select(bool skipcur)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
- pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
- cs->name);
- override_name[0] = 0;
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+ cs->name);
+ override_name[0] = 0;
+ } else {
+ /*
+ * The override cannot be currently verified.
+ * Deferring to let the watchdog check.
+ */
+ pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
+ cs->name);
+ }
} else
/* Override clocksource can be used. */
best = cs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9ba7c820fc23..8e11d8d9f419 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,7 +50,7 @@
#include <linux/timer.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/timer.h>
@@ -94,17 +94,15 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
-static inline int hrtimer_clockid_to_base(clockid_t clock_id)
-{
- return hrtimer_clock_to_base_table[clock_id];
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -171,7 +169,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
return 0;
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+ return expires <= new_base->cpu_base->expires_next;
#else
return 0;
#endif
@@ -307,13 +305,13 @@ EXPORT_SYMBOL_GPL(__ktime_divns);
*/
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
- ktime_t res = ktime_add(lhs, rhs);
+ ktime_t res = ktime_add_unsafe(lhs, rhs);
/*
* We use KTIME_SEC_MAX here, the maximum timeout which we can
* return to user space in a timespec:
*/
- if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
+ if (res < 0 || res < lhs || res < rhs)
res = ktime_set(KTIME_SEC_MAX, 0);
return res;
@@ -465,8 +463,8 @@ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
unsigned int active = cpu_base->active_bases;
+ ktime_t expires, expires_next = KTIME_MAX;
hrtimer_update_next_timer(cpu_base, NULL);
for (; active; base++, active >>= 1) {
@@ -479,7 +477,7 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires.tv64 < expires_next.tv64) {
+ if (expires < expires_next) {
expires_next = expires;
hrtimer_update_next_timer(cpu_base, timer);
}
@@ -489,8 +487,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
* the clock bases so the result might be negative. Fix it up
* to prevent a false positive in clockevents_program_event().
*/
- if (expires_next.tv64 < 0)
- expires_next.tv64 = 0;
+ if (expires_next < 0)
+ expires_next = 0;
return expires_next;
}
#endif
@@ -561,10 +559,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
expires_next = __hrtimer_get_next_event(cpu_base);
- if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
+ if (skip_equal && expires_next == cpu_base->expires_next)
return;
- cpu_base->expires_next.tv64 = expires_next.tv64;
+ cpu_base->expires_next = expires_next;
/*
* If a hang was detected in the last timer interrupt then we
@@ -622,10 +620,10 @@ static void hrtimer_reprogram(struct hrtimer *timer,
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
*/
- if (expires.tv64 < 0)
- expires.tv64 = 0;
+ if (expires < 0)
+ expires = 0;
- if (expires.tv64 >= cpu_base->expires_next.tv64)
+ if (expires >= cpu_base->expires_next)
return;
/* Update the pointer to the next expiring timer */
@@ -653,7 +651,7 @@ static void hrtimer_reprogram(struct hrtimer *timer,
*/
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
- base->expires_next.tv64 = KTIME_MAX;
+ base->expires_next = KTIME_MAX;
base->hres_active = 0;
}
@@ -703,7 +701,7 @@ static void clock_was_set_work(struct work_struct *work)
static DECLARE_WORK(hrtimer_work, clock_was_set_work);
/*
- * Called from timekeeping and resume code to reprogramm the hrtimer
+ * Called from timekeeping and resume code to reprogram the hrtimer
* interrupt device on all cpus.
*/
void clock_was_set_delayed(void)
@@ -766,34 +764,6 @@ void hrtimers_resume(void)
clock_was_set_delayed();
}
-static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (timer->start_site)
- return;
- timer->start_site = __builtin_return_address(0);
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-#endif
-}
-
-static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
-#endif
-}
-
-static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
-{
-#ifdef CONFIG_TIMER_STATS
- if (likely(!timer_stats_active))
- return;
- timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, 0);
-#endif
-}
-
/*
* Counterpart to lock_hrtimer_base above:
*/
@@ -827,21 +797,21 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
delta = ktime_sub(now, hrtimer_get_expires(timer));
- if (delta.tv64 < 0)
+ if (delta < 0)
return 0;
if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
return 0;
- if (interval.tv64 < hrtimer_resolution)
- interval.tv64 = hrtimer_resolution;
+ if (interval < hrtimer_resolution)
+ interval = hrtimer_resolution;
- if (unlikely(delta.tv64 >= interval.tv64)) {
+ if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
hrtimer_add_expires_ns(timer, incr * orun);
- if (hrtimer_get_expires_tv64(timer) > now.tv64)
+ if (hrtimer_get_expires_tv64(timer) > now)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -932,7 +902,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
* rare case and less expensive than a smp call.
*/
debug_deactivate(timer);
- timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
if (!restart)
@@ -955,7 +924,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
- tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+ tim = ktime_add_safe(tim, hrtimer_resolution);
#endif
return tim;
}
@@ -990,8 +959,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- timer_stats_hrtimer_set_start_info(timer);
-
leftmost = enqueue_hrtimer(timer, new_base);
if (!leftmost)
goto unlock;
@@ -1104,7 +1071,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base).tv64;
+ expires = __hrtimer_get_next_event(cpu_base);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1112,6 +1079,18 @@ u64 hrtimer_get_next_event(void)
}
#endif
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ if (likely(clock_id < MAX_CLOCKS)) {
+ int base = hrtimer_clock_to_base_table[clock_id];
+
+ if (likely(base != HRTIMER_MAX_CLOCK_BASES))
+ return base;
+ }
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
+}
+
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
@@ -1128,12 +1107,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base = hrtimer_clockid_to_base(clock_id);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
-
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
}
/**
@@ -1217,7 +1190,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_write_seqcount_barrier(&cpu_base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
- timer_stats_account_hrtimer(timer);
fn = timer->function;
/*
@@ -1241,7 +1213,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
/*
* Note: We clear the running state after enqueue_hrtimer and
- * we do not reprogramm the event hardware. Happens either in
+ * we do not reprogram the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
* Note: Because we dropped the cpu_base->lock above,
@@ -1296,7 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
- if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
+ if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
__run_hrtimer(cpu_base, base, timer, &basenow);
@@ -1318,7 +1290,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
raw_spin_lock(&cpu_base->lock);
entry_time = now = hrtimer_update_base(cpu_base);
@@ -1331,7 +1303,7 @@ retry:
* timers which run their callback and need to be requeued on
* this CPU.
*/
- cpu_base->expires_next.tv64 = KTIME_MAX;
+ cpu_base->expires_next = KTIME_MAX;
__hrtimer_run_queues(cpu_base, now);
@@ -1379,13 +1351,13 @@ retry:
cpu_base->hang_detected = 1;
raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
- if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
- cpu_base->max_hang_time = (unsigned int) delta.tv64;
+ if ((unsigned int)delta > cpu_base->max_hang_time)
+ cpu_base->max_hang_time = (unsigned int) delta;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
*/
- if (delta.tv64 > 100 * NSEC_PER_MSEC)
+ if (delta > 100 * NSEC_PER_MSEC)
expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
else
expires_next = ktime_add(now, delta);
@@ -1495,7 +1467,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
ktime_t rem;
rem = hrtimer_expires_remaining(timer);
- if (rem.tv64 <= 0)
+ if (rem <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -1693,7 +1665,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* Optimize when a zero timeout value is given. It does not
* matter whether this is an absolute or a relative time.
*/
- if (expires && !expires->tv64) {
+ if (expires && *expires == 0) {
__set_current_state(TASK_RUNNING);
return 0;
}
@@ -1742,15 +1714,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns.
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task.
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
- * Returns 0 when the timer has expired otherwise -EINTR
+ * Returns 0 when the timer has expired. If the task was woken before the
+ * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
+ * by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
@@ -1772,15 +1748,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
- * pass before the routine returns.
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task.
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
- * Returns 0 when the timer has expired otherwise -EINTR
+ * Returns 0 when the timer has expired. If the task was woken before the
+ * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
+ * by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout(ktime_t *expires,
const enum hrtimer_mode mode)
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 1d5c7204ddc9..a95f13c31464 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -14,7 +14,7 @@
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/**
* itimer_get_remtime - get remaining time for the timer
@@ -34,10 +34,10 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
* then we return 0 - which is correct.
*/
if (hrtimer_active(timer)) {
- if (rem.tv64 <= 0)
- rem.tv64 = NSEC_PER_USEC;
+ if (rem <= 0)
+ rem = NSEC_PER_USEC;
} else
- rem.tv64 = 0;
+ rem = 0;
return ktime_to_timeval(rem);
}
@@ -45,16 +45,16 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
struct itimerval *const value)
{
- cputime_t cval, cinterval;
+ u64 val, interval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
spin_lock_irq(&tsk->sighand->siglock);
- cval = it->expires;
- cinterval = it->incr;
- if (cval) {
+ val = it->expires;
+ interval = it->incr;
+ if (val) {
struct task_cputime cputime;
- cputime_t t;
+ u64 t;
thread_group_cputimer(tsk, &cputime);
if (clock_id == CPUCLOCK_PROF)
@@ -63,17 +63,17 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
/* CPUCLOCK_VIRT */
t = cputime.utime;
- if (cval < t)
+ if (val < t)
/* about to fire */
- cval = cputime_one_jiffy;
+ val = TICK_NSEC;
else
- cval = cval - t;
+ val -= t;
}
spin_unlock_irq(&tsk->sighand->siglock);
- cputime_to_timeval(cval, &value->it_value);
- cputime_to_timeval(cinterval, &value->it_interval);
+ value->it_value = ns_to_timeval(val);
+ value->it_interval = ns_to_timeval(interval);
}
int do_getitimer(int which, struct itimerval *value)
@@ -129,55 +129,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
-{
- struct timespec ts;
- s64 cpu_ns;
-
- cputime_to_timespec(ct, &ts);
- cpu_ns = timespec_to_ns(&ts);
-
- return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
-}
-
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
const struct itimerval *const value,
struct itimerval *const ovalue)
{
- cputime_t cval, nval, cinterval, ninterval;
- s64 ns_ninterval, ns_nval;
- u32 error, incr_error;
+ u64 oval, nval, ointerval, ninterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
- nval = timeval_to_cputime(&value->it_value);
- ns_nval = timeval_to_ns(&value->it_value);
- ninterval = timeval_to_cputime(&value->it_interval);
- ns_ninterval = timeval_to_ns(&value->it_interval);
-
- error = cputime_sub_ns(nval, ns_nval);
- incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+ nval = timeval_to_ns(&value->it_value);
+ ninterval = timeval_to_ns(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
- cval = it->expires;
- cinterval = it->incr;
- if (cval || nval) {
+ oval = it->expires;
+ ointerval = it->incr;
+ if (oval || nval) {
if (nval > 0)
- nval += cputime_one_jiffy;
- set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+ nval += TICK_NSEC;
+ set_process_cpu_timer(tsk, clock_id, &nval, &oval);
}
it->expires = nval;
it->incr = ninterval;
- it->error = error;
- it->incr_error = incr_error;
trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
- cputime_to_timeval(cval, &ovalue->it_value);
- cputime_to_timeval(cinterval, &ovalue->it_interval);
+ ovalue->it_value = ns_to_timeval(oval);
+ ovalue->it_interval = ns_to_timeval(ointerval);
}
}
@@ -216,12 +196,12 @@ again:
goto again;
}
expires = timeval_to_ktime(value->it_value);
- if (expires.tv64 != 0) {
+ if (expires != 0) {
tsk->signal->it_real_incr =
timeval_to_ktime(value->it_interval);
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
} else
- tsk->signal->it_real_incr.tv64 = 0;
+ tsk->signal->it_real_incr = 0;
trace_itimer_state(ITIMER_REAL, value, 0);
spin_unlock_irq(&tsk->sighand->siglock);
@@ -238,6 +218,8 @@ again:
return 0;
}
+#ifdef __ARCH_WANT_SYS_ALARM
+
/**
* alarm_setitimer - set alarm in seconds
*
@@ -250,7 +232,7 @@ again:
* On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
* negative timeval settings which would cause immediate expiry.
*/
-unsigned int alarm_setitimer(unsigned int seconds)
+static unsigned int alarm_setitimer(unsigned int seconds)
{
struct itimerval it_new, it_old;
@@ -275,6 +257,17 @@ unsigned int alarm_setitimer(unsigned int seconds)
return it_old.it_value.tv_sec;
}
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+SYSCALL_DEFINE1(alarm, unsigned int, seconds)
+{
+ return alarm_setitimer(seconds);
+}
+
+#endif
+
SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
struct itimerval __user *, ovalue)
{
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 555e21f7b966..7906b3f0c41a 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -27,19 +27,8 @@
#include "timekeeping.h"
-/* The Jiffies based clocksource is the lowest common
- * denominator clock source which should function on
- * all systems. It has the same coarse resolution as
- * the timer interrupt frequency HZ and it suffers
- * inaccuracies caused by missed or lost timer
- * interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
- * requested HZ value. It is also not recommended
- * for "tick-less" systems.
- */
-#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
-/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+/* Since jiffies uses a simple TICK_NSEC multiplier
* conversion, the .shift value could be zero. However
* this would make NTP adjustments impossible as they are
* in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
@@ -47,8 +36,8 @@
* amount, and give ntp adjustments in units of 1/2^8
*
* The value 8 is somewhat carefully chosen, as anything
- * larger can result in overflows. NSEC_PER_JIFFY grows as
- * HZ shrinks, so values greater than 8 overflow 32bits when
+ * larger can result in overflows. TICK_NSEC grows as HZ
+ * shrinks, so values greater than 8 overflow 32bits when
* HZ=100.
*/
#if HZ < 34
@@ -59,17 +48,28 @@
#define JIFFIES_SHIFT 8
#endif
-static cycle_t jiffies_read(struct clocksource *cs)
+static u64 jiffies_read(struct clocksource *cs)
{
- return (cycle_t) jiffies;
+ return (u64) jiffies;
}
+/*
+ * The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
- .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,
};
@@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second)
shift_hz += cycles_per_tick/2;
do_div(shift_hz, cycles_per_tick);
/* Calculate nsec_per_tick using shift_hz */
- nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
nsec_per_tick += (u32)shift_hz/2;
do_div(nsec_per_tick, (u32)shift_hz);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 6df8927c58a5..edf19cc53140 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -381,7 +381,7 @@ ktime_t ntp_get_next_leap(void)
if ((time_state == TIME_INS) && (time_status & STA_INS))
return ktime_set(ntp_next_leap_sec, 0);
- ret.tv64 = KTIME_MAX;
+ ret = KTIME_MAX;
return ret;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d78927a..b4377a5e4269 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -6,10 +6,9 @@
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
-#include <linux/random.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
@@ -21,10 +20,10 @@
*/
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
- cputime_t cputime = secs_to_cputime(rlim_new);
+ u64 nsecs = rlim_new * NSEC_PER_SEC;
spin_lock_irq(&task->sighand->siglock);
- set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+ set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
spin_unlock_irq(&task->sighand->siglock);
}
@@ -51,39 +50,14 @@ static int check_clock(const clockid_t which_clock)
return error;
}
-static inline unsigned long long
-timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
-{
- unsigned long long ret;
-
- ret = 0; /* high half always zero when .cpu used */
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
- ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
- } else {
- ret = cputime_to_expires(timespec_to_cputime(tp));
- }
- return ret;
-}
-
-static void sample_to_timespec(const clockid_t which_clock,
- unsigned long long expires,
- struct timespec *tp)
-{
- if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
- *tp = ns_to_timespec(expires);
- else
- cputime_to_timespec((__force cputime_t)expires, tp);
-}
-
/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
-static void bump_cpu_timer(struct k_itimer *timer,
- unsigned long long now)
+static void bump_cpu_timer(struct k_itimer *timer, u64 now)
{
int i;
- unsigned long long delta, incr;
+ u64 delta, incr;
if (timer->it.cpu.incr == 0)
return;
@@ -123,21 +97,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
return 0;
}
-static inline unsigned long long prof_ticks(struct task_struct *p)
+static inline u64 prof_ticks(struct task_struct *p)
{
- cputime_t utime, stime;
+ u64 utime, stime;
task_cputime(p, &utime, &stime);
- return cputime_to_expires(utime + stime);
+ return utime + stime;
}
-static inline unsigned long long virt_ticks(struct task_struct *p)
+static inline u64 virt_ticks(struct task_struct *p)
{
- cputime_t utime;
+ u64 utime, stime;
- task_cputime(p, &utime, NULL);
+ task_cputime(p, &utime, &stime);
- return cputime_to_expires(utime);
+ return utime;
}
static int
@@ -177,8 +151,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
/*
* Sample a per-thread clock for the given task.
*/
-static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
- unsigned long long *sample)
+static int cpu_clock_sample(const clockid_t which_clock,
+ struct task_struct *p, u64 *sample)
{
switch (CPUCLOCK_WHICH(which_clock)) {
default:
@@ -261,7 +235,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
*/
static int cpu_clock_sample_group(const clockid_t which_clock,
struct task_struct *p,
- unsigned long long *sample)
+ u64 *sample)
{
struct task_cputime cputime;
@@ -270,11 +244,11 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
- *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ *sample = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
- *sample = cputime_to_expires(cputime.utime);
+ *sample = cputime.utime;
break;
case CPUCLOCK_SCHED:
thread_group_cputime(p, &cputime);
@@ -289,7 +263,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
struct timespec *tp)
{
int err = -EINVAL;
- unsigned long long rtn;
+ u64 rtn;
if (CPUCLOCK_PERTHREAD(which_clock)) {
if (same_thread_group(tsk, current))
@@ -300,7 +274,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
}
if (!err)
- sample_to_timespec(which_clock, rtn, tp);
+ *tp = ns_to_timespec(rtn);
return err;
}
@@ -447,17 +421,14 @@ static void cleanup_timers(struct list_head *head)
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
cleanup_timers(tsk->cpu_timers);
-
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
cleanup_timers(tsk->signal->cpu_timers);
}
-static inline int expires_gt(cputime_t expires, cputime_t new_exp)
+static inline int expires_gt(u64 expires, u64 new_exp)
{
return expires == 0 || expires > new_exp;
}
@@ -492,7 +463,7 @@ static void arm_timer(struct k_itimer *timer)
list_add(&nt->entry, listpos);
if (listpos == head) {
- unsigned long long exp = nt->expires;
+ u64 exp = nt->expires;
/*
* We are the new earliest-expiring POSIX 1.b timer, hence
@@ -503,16 +474,15 @@ static void arm_timer(struct k_itimer *timer)
switch (CPUCLOCK_WHICH(timer->it_clock)) {
case CPUCLOCK_PROF:
- if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
- cputime_expires->prof_exp = expires_to_cputime(exp);
+ if (expires_gt(cputime_expires->prof_exp, exp))
+ cputime_expires->prof_exp = exp;
break;
case CPUCLOCK_VIRT:
- if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
- cputime_expires->virt_exp = expires_to_cputime(exp);
+ if (expires_gt(cputime_expires->virt_exp, exp))
+ cputime_expires->virt_exp = exp;
break;
case CPUCLOCK_SCHED:
- if (cputime_expires->sched_exp == 0 ||
- cputime_expires->sched_exp > exp)
+ if (expires_gt(cputime_expires->sched_exp, exp))
cputime_expires->sched_exp = exp;
break;
}
@@ -563,8 +533,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
* traversal.
*/
static int cpu_timer_sample_group(const clockid_t which_clock,
- struct task_struct *p,
- unsigned long long *sample)
+ struct task_struct *p, u64 *sample)
{
struct task_cputime cputime;
@@ -573,10 +542,10 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
default:
return -EINVAL;
case CPUCLOCK_PROF:
- *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ *sample = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
- *sample = cputime_to_expires(cputime.utime);
+ *sample = cputime.utime;
break;
case CPUCLOCK_SCHED:
*sample = cputime.sum_exec_runtime;
@@ -597,12 +566,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
unsigned long flags;
struct sighand_struct *sighand;
struct task_struct *p = timer->it.cpu.task;
- unsigned long long old_expires, new_expires, old_incr, val;
+ u64 old_expires, new_expires, old_incr, val;
int ret;
WARN_ON_ONCE(p == NULL);
- new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+ new_expires = timespec_to_ns(&new->it_value);
/*
* Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -663,9 +632,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
bump_cpu_timer(timer, val);
if (val < timer->it.cpu.expires) {
old_expires = timer->it.cpu.expires - val;
- sample_to_timespec(timer->it_clock,
- old_expires,
- &old->it_value);
+ old->it_value = ns_to_timespec(old_expires);
} else {
old->it_value.tv_nsec = 1;
old->it_value.tv_sec = 0;
@@ -703,8 +670,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
- &new->it_interval);
+ timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -727,17 +693,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = 0;
out:
- if (old) {
- sample_to_timespec(timer->it_clock,
- old_incr, &old->it_interval);
- }
+ if (old)
+ old->it_interval = ns_to_timespec(old_incr);
return ret;
}
static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
- unsigned long long now;
+ u64 now;
struct task_struct *p = timer->it.cpu.task;
WARN_ON_ONCE(p == NULL);
@@ -745,8 +709,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
/*
* Easy part: convert the reload time.
*/
- sample_to_timespec(timer->it_clock,
- timer->it.cpu.incr, &itp->it_interval);
+ itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -775,8 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
* Call the timer disarmed, nothing else to do.
*/
timer->it.cpu.expires = 0;
- sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
- &itp->it_value);
+ itp->it_value = ns_to_timespec(timer->it.cpu.expires);
return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -785,9 +747,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
}
if (now < timer->it.cpu.expires) {
- sample_to_timespec(timer->it_clock,
- timer->it.cpu.expires - now,
- &itp->it_value);
+ itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
} else {
/*
* The timer should have expired already, but the firing
@@ -831,7 +791,7 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
- unsigned long long expires;
+ u64 expires;
unsigned long soft;
/*
@@ -842,10 +802,10 @@ static void check_thread_timers(struct task_struct *tsk,
return;
expires = check_timers_list(timers, firing, prof_ticks(tsk));
- tsk_expires->prof_exp = expires_to_cputime(expires);
+ tsk_expires->prof_exp = expires;
expires = check_timers_list(++timers, firing, virt_ticks(tsk));
- tsk_expires->virt_exp = expires_to_cputime(expires);
+ tsk_expires->virt_exp = expires;
tsk_expires->sched_exp = check_timers_list(++timers, firing,
tsk->se.sum_exec_runtime);
@@ -894,26 +854,17 @@ static inline void stop_process_timers(struct signal_struct *sig)
tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}
-static u32 onecputick;
-
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
- unsigned long long *expires,
- unsigned long long cur_time, int signo)
+ u64 *expires, u64 cur_time, int signo)
{
if (!it->expires)
return;
if (cur_time >= it->expires) {
- if (it->incr) {
+ if (it->incr)
it->expires += it->incr;
- it->error += it->incr_error;
- if (it->error >= onecputick) {
- it->expires -= cputime_one_jiffy;
- it->error -= onecputick;
- }
- } else {
+ else
it->expires = 0;
- }
trace_itimer_expire(signo == SIGPROF ?
ITIMER_PROF : ITIMER_VIRTUAL,
@@ -921,9 +872,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
}
- if (it->expires && (!*expires || it->expires < *expires)) {
+ if (it->expires && (!*expires || it->expires < *expires))
*expires = it->expires;
- }
}
/*
@@ -935,8 +885,8 @@ static void check_process_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct signal_struct *const sig = tsk->signal;
- unsigned long long utime, ptime, virt_expires, prof_expires;
- unsigned long long sum_sched_runtime, sched_expires;
+ u64 utime, ptime, virt_expires, prof_expires;
+ u64 sum_sched_runtime, sched_expires;
struct list_head *timers = sig->cpu_timers;
struct task_cputime cputime;
unsigned long soft;
@@ -958,8 +908,8 @@ static void check_process_timers(struct task_struct *tsk,
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
- utime = cputime_to_expires(cputime.utime);
- ptime = utime + cputime_to_expires(cputime.stime);
+ utime = cputime.utime;
+ ptime = utime + cputime.stime;
sum_sched_runtime = cputime.sum_exec_runtime;
prof_expires = check_timers_list(timers, firing, ptime);
@@ -975,10 +925,10 @@ static void check_process_timers(struct task_struct *tsk,
SIGVTALRM);
soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
- unsigned long psecs = cputime_to_secs(ptime);
+ unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
unsigned long hard =
READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
- cputime_t x;
+ u64 x;
if (psecs >= hard) {
/*
* At the hard limit, we just die.
@@ -997,14 +947,13 @@ static void check_process_timers(struct task_struct *tsk,
sig->rlim[RLIMIT_CPU].rlim_cur = soft;
}
}
- x = secs_to_cputime(soft);
- if (!prof_expires || x < prof_expires) {
+ x = soft * NSEC_PER_SEC;
+ if (!prof_expires || x < prof_expires)
prof_expires = x;
- }
}
- sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
- sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
+ sig->cputime_expires.prof_exp = prof_expires;
+ sig->cputime_expires.virt_exp = virt_expires;
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
@@ -1021,7 +970,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
struct sighand_struct *sighand;
unsigned long flags;
struct task_struct *p = timer->it.cpu.task;
- unsigned long long now;
+ u64 now;
WARN_ON_ONCE(p == NULL);
@@ -1218,9 +1167,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* The tsk->sighand->siglock must be held by the caller.
*/
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
- cputime_t *newval, cputime_t *oldval)
+ u64 *newval, u64 *oldval)
{
- unsigned long long now;
+ u64 now;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1234,7 +1183,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
if (*oldval) {
if (*oldval <= now) {
/* Just about to fire. */
- *oldval = cputime_one_jiffy;
+ *oldval = TICK_NSEC;
} else {
*oldval -= now;
}
@@ -1314,7 +1263,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
/*
* We were interrupted by a signal.
*/
- sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+ *rqtp = ns_to_timespec(timer.it.cpu.expires);
error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
if (!error) {
/*
@@ -1480,15 +1429,10 @@ static __init int init_posix_cpu_timers(void)
.clock_get = thread_cpu_clock_get,
.timer_create = thread_cpu_timer_create,
};
- struct timespec ts;
posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
- cputime_to_timespec(cputime_one_jiffy, &ts);
- onecputick = ts.tv_nsec;
- WARN_ON(ts.tv_sec != 0);
-
return 0;
}
__initcall(init_posix_cpu_timers);
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
new file mode 100644
index 000000000000..cd6716e115e8
--- /dev/null
+++ b/kernel/time/posix-stubs.c
@@ -0,0 +1,123 @@
+/*
+ * Dummy stubs used when CONFIG_POSIX_TIMERS=n
+ *
+ * Created by: Nicolas Pitre, July 2016
+ * Copyright: (C) 2016 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/ktime.h>
+#include <linux/timekeeping.h>
+#include <linux/posix-timers.h>
+
+asmlinkage long sys_ni_posix_timers(void)
+{
+ pr_err_once("process %d (%s) attempted a POSIX timer syscall "
+ "while CONFIG_POSIX_TIMERS is not set\n",
+ current->pid, current->comm);
+ return -ENOSYS;
+}
+
+#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
+
+SYS_NI(timer_create);
+SYS_NI(timer_gettime);
+SYS_NI(timer_getoverrun);
+SYS_NI(timer_settime);
+SYS_NI(timer_delete);
+SYS_NI(clock_adjtime);
+SYS_NI(getitimer);
+SYS_NI(setitimer);
+#ifdef __ARCH_WANT_SYS_ALARM
+SYS_NI(alarm);
+#endif
+
+/*
+ * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
+ * as it is easy to remain compatible with little code. CLOCK_BOOTTIME
+ * is also included for convenience as at least systemd uses it.
+ */
+
+SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
+ const struct timespec __user *, tp)
+{
+ struct timespec new_tp;
+
+ if (which_clock != CLOCK_REALTIME)
+ return -EINVAL;
+ if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ return -EFAULT;
+ return do_sys_settimeofday(&new_tp, NULL);
+}
+
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *,tp)
+{
+ struct timespec kernel_tp;
+
+ switch (which_clock) {
+ case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break;
+ case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break;
+ case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break;
+ default: return -EINVAL;
+ }
+ if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ return -EFAULT;
+ return 0;
+}
+
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
+{
+ struct timespec rtn_tp = {
+ .tv_sec = 0,
+ .tv_nsec = hrtimer_resolution,
+ };
+
+ switch (which_clock) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_BOOTTIME:
+ if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
+ const struct timespec __user *, rqtp,
+ struct timespec __user *, rmtp)
+{
+ struct timespec t;
+
+ switch (which_clock) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_BOOTTIME:
+ if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ return -EFAULT;
+ if (!timespec_valid(&t))
+ return -EINVAL;
+ return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ return hrtimer_nanosleep_restart(restart_block);
+}
+#endif
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f2826c35e918..1e6623d76750 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -36,7 +36,7 @@
#include <linux/time.h>
#include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
@@ -359,7 +359,7 @@ static void schedule_next_timer(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- if (timr->it.real.interval.tv64 == 0)
+ if (timr->it.real.interval == 0)
return;
timr->it_overrun += (unsigned int) hrtimer_forward(timer,
@@ -449,7 +449,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
- if (timr->it.real.interval.tv64 != 0)
+ if (timr->it.real.interval != 0)
si_private = ++timr->it_requeue_pending;
if (posix_timer_event(timr, si_private)) {
@@ -458,7 +458,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* we will not get a call back to restart it AND
* it should be restarted.
*/
- if (timr->it.real.interval.tv64 != 0) {
+ if (timr->it.real.interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
/*
@@ -485,9 +485,9 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
*/
#ifdef CONFIG_HIGH_RES_TIMERS
{
- ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
+ ktime_t kj = NSEC_PER_SEC / HZ;
- if (timr->it.real.interval.tv64 < kj.tv64)
+ if (timr->it.real.interval < kj)
now = ktime_add(now, kj);
}
#endif
@@ -743,7 +743,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
iv = timr->it.real.interval;
/* interval timer ? */
- if (iv.tv64)
+ if (iv)
cur_setting->it_interval = ktime_to_timespec(iv);
else if (!hrtimer_active(timer) &&
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
@@ -756,13 +756,13 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
* timer move the expiry time forward by intervals, so
* expiry is > now.
*/
- if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
- (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
- if (remaining.tv64 <= 0) {
+ if (remaining <= 0) {
/*
* A single shot SIGEV_NONE timer must return 0, when
* it is expired !
@@ -839,7 +839,7 @@ common_timer_set(struct k_itimer *timr, int flags,
common_timer_get(timr, old_setting);
/* disable the timer */
- timr->it.real.interval.tv64 = 0;
+ timr->it.real.interval = 0;
/*
* careful here. If smp we could be in the "fire" routine which will
* be spinning as we hold the lock. But this is ONLY an SMP issue.
@@ -924,7 +924,7 @@ retry:
static int common_timer_del(struct k_itimer *timer)
{
- timer->it.real.interval.tv64 = 0;
+ timer->it.real.interval = 0;
if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
return TIMER_RETRY;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 690b797f522e..a7bb8f33ae07 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -97,7 +97,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
- if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+ if (ce_broadcast_hrtimer.next_event != KTIME_MAX)
return HRTIMER_RESTART;
return HRTIMER_NORESTART;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f6aae7977824..987e496bb51a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -29,12 +29,13 @@
*/
static struct tick_device tick_broadcast_device;
-static cpumask_var_t tick_broadcast_mask;
-static cpumask_var_t tick_broadcast_on;
-static cpumask_var_t tmpmask;
-static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
+static cpumask_var_t tmpmask __cpumask_var_read_mostly;
static int tick_broadcast_forced;
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -347,17 +348,16 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
*
* Called when the system enters a state where affected tick devices
* might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
*/
void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
int cpu, bc_stopped;
+ unsigned long flags;
+ /* Also protects the local clockevent device. */
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
@@ -365,12 +365,11 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
* Is the device not affected by the powerstate ?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return;
+ goto out;
if (!tick_device_is_functional(dev))
- return;
+ goto out;
- raw_spin_lock(&tick_broadcast_lock);
cpu = smp_processor_id();
bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
@@ -420,7 +419,8 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
tick_broadcast_setup_oneshot(bc);
}
}
- raw_spin_unlock(&tick_broadcast_lock);
+out:
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
EXPORT_SYMBOL_GPL(tick_broadcast_control);
@@ -517,9 +517,9 @@ void tick_resume_broadcast(void)
#ifdef CONFIG_TICK_ONESHOT
-static cpumask_var_t tick_broadcast_oneshot_mask;
-static cpumask_var_t tick_broadcast_pending_mask;
-static cpumask_var_t tick_broadcast_force_mask;
+static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
+static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;
/*
* Exposed for debugging: see timer_list.c
@@ -604,14 +604,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
- dev->next_event.tv64 = KTIME_MAX;
- next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
+ next_event = KTIME_MAX;
cpumask_clear(tmpmask);
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
td = &per_cpu(tick_cpu_device, cpu);
- if (td->evtdev->next_event.tv64 <= now.tv64) {
+ if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
/*
* Mark the remote cpu in the pending mask, so
@@ -619,8 +619,8 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
* timer in tick_broadcast_oneshot_control().
*/
cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
- } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
- next_event.tv64 = td->evtdev->next_event.tv64;
+ } else if (td->evtdev->next_event < next_event) {
+ next_event = td->evtdev->next_event;
next_cpu = cpu;
}
}
@@ -657,7 +657,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
* - There are pending events on sleeping CPUs which were not
* in the event mask
*/
- if (next_event.tv64 != KTIME_MAX)
+ if (next_event != KTIME_MAX)
tick_broadcast_set_event(dev, next_cpu, next_event);
raw_spin_unlock(&tick_broadcast_lock);
@@ -672,7 +672,7 @@ static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
{
if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
return 0;
- if (bc->next_event.tv64 == KTIME_MAX)
+ if (bc->next_event == KTIME_MAX)
return 0;
return bc->bound_on == cpu ? -EBUSY : 0;
}
@@ -688,7 +688,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
if (broadcast_needs_cpu(bc, smp_processor_id()))
return;
- if (dev->next_event.tv64 < bc->next_event.tv64)
+ if (dev->next_event < bc->next_event)
return;
}
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
@@ -754,7 +754,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
*/
if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
ret = -EBUSY;
- } else if (dev->next_event.tv64 < bc->next_event.tv64) {
+ } else if (dev->next_event < bc->next_event) {
tick_broadcast_set_event(bc, cpu, dev->next_event);
/*
* In case of hrtimer broadcasts the
@@ -789,7 +789,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
/*
* Bail out if there is no next event.
*/
- if (dev->next_event.tv64 == KTIME_MAX)
+ if (dev->next_event == KTIME_MAX)
goto out;
/*
* If the pending bit is not set, then we are
@@ -824,7 +824,7 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
* nohz fixups.
*/
now = ktime_get();
- if (dev->next_event.tv64 <= now.tv64) {
+ if (dev->next_event <= now) {
cpumask_set_cpu(cpu, tick_broadcast_force_mask);
goto out;
}
@@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
int cpu = smp_processor_id();
+ if (!bc)
+ return;
+
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
int was_periodic = clockevent_state_periodic(bc);
@@ -894,7 +897,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_next_period);
tick_broadcast_set_event(bc, cpu, tick_next_period);
} else
- bc->next_event.tv64 = KTIME_MAX;
+ bc->next_event = KTIME_MAX;
} else {
/*
* The first cpu which switches to oneshot mode sets
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4fcd99e12aa0..49edc1c4f3e6 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -178,8 +178,8 @@ static void tick_setup_device(struct tick_device *td,
struct clock_event_device *newdev, int cpu,
const struct cpumask *cpumask)
{
- ktime_t next_event;
void (*handler)(struct clock_event_device *) = NULL;
+ ktime_t next_event = 0;
/*
* First device setup ?
@@ -195,7 +195,7 @@ static void tick_setup_device(struct tick_device *td,
else
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
- tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ tick_period = NSEC_PER_SEC / HZ;
}
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index b51344652330..6b009c207671 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- if (unlikely(expires.tv64 == KTIME_MAX)) {
+ if (unlikely(expires == KTIME_MAX)) {
/*
* We don't need the clock event device any more, stop it.
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2ec7c00228f3..2c115fdab397 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,21 +58,21 @@ static void tick_do_update_jiffies64(ktime_t now)
* Do a quick check without holding jiffies_lock:
*/
delta = ktime_sub(now, last_jiffies_update);
- if (delta.tv64 < tick_period.tv64)
+ if (delta < tick_period)
return;
/* Reevaluate with jiffies_lock held */
write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
- if (delta.tv64 >= tick_period.tv64) {
+ if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
last_jiffies_update = ktime_add(last_jiffies_update,
tick_period);
/* Slow path for long timeouts */
- if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ if (unlikely(delta >= tick_period)) {
s64 incr = ktime_to_ns(tick_period);
ticks = ktime_divns(delta, incr);
@@ -101,7 +101,7 @@ static ktime_t tick_init_jiffy_update(void)
write_seqlock(&jiffies_lock);
/* Did we start the jiffies update yet ? */
- if (last_jiffies_update.tv64 == 0)
+ if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
write_sequnlock(&jiffies_lock);
@@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep)
return false;
}
-static bool can_stop_full_tick(struct tick_sched *ts)
+static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
+ if (unlikely(!cpu_online(cpu)))
+ return false;
+
if (check_tick_dependency(&tick_dep_mask))
return false;
@@ -387,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str)
}
__setup("nohz_full=", tick_nohz_full_setup);
-static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int tick_nohz_cpu_down(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- /*
- * The boot CPU handles housekeeping duty (unbound timers,
- * workqueues, timekeeping, ...) on behalf of full dynticks
- * CPUs. It must remain online when nohz full is enabled.
- */
- if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return NOTIFY_BAD;
- break;
- }
- return NOTIFY_OK;
+ /*
+ * The boot CPU handles housekeeping duty (unbound timers,
+ * workqueues, timekeeping, ...) on behalf of full dynticks
+ * CPUs. It must remain online when nohz full is enabled.
+ */
+ if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
+ return -EBUSY;
+ return 0;
}
static int tick_nohz_init_all(void)
@@ -425,7 +420,7 @@ static int tick_nohz_init_all(void)
void __init tick_nohz_init(void)
{
- int cpu;
+ int cpu, ret;
if (!tick_nohz_full_running) {
if (tick_nohz_init_all() < 0)
@@ -466,7 +461,10 @@ void __init tick_nohz_init(void)
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
- cpu_notifier(tick_nohz_cpu_down_callback, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "kernel/nohz:predown", NULL,
+ tick_nohz_cpu_down);
+ WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
@@ -671,7 +669,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&jiffies_lock);
- basemono = last_jiffies_update.tv64;
+ basemono = last_jiffies_update;
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
@@ -699,7 +697,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
- tick.tv64 = 0;
+ tick = 0;
/*
* Tell the timer code that the base is not idle, i.e. undo
@@ -766,10 +764,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
expires = KTIME_MAX;
expires = min_t(u64, expires, next_tick);
- tick.tv64 = expires;
+ tick = expires;
/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == dev->next_event.tv64))
+ if (ts->tick_stopped && (expires == dev->next_event))
goto out;
/*
@@ -843,7 +841,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (can_stop_full_tick(ts))
+ if (can_stop_full_tick(cpu, ts))
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get());
@@ -866,7 +864,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+ ts->sleep_length = NSEC_PER_SEC / HZ;
return false;
}
@@ -916,7 +914,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
ts->idle_calls++;
expires = tick_nohz_stop_sched_tick(ts, now, cpu);
- if (expires.tv64 > 0LL) {
+ if (expires > 0LL) {
ts->idle_sleeps++;
ts->idle_expires = expires;
}
@@ -1053,7 +1051,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- dev->next_event.tv64 = KTIME_MAX;
+ dev->next_event = KTIME_MAX;
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 667b9335f5d6..25bdd2504571 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -38,7 +38,7 @@
#include <linux/math64.h>
#include <linux/ptrace.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <generated/timeconst.h>
@@ -702,6 +702,16 @@ u64 nsec_to_clock_t(u64 x)
#endif
}
+u64 jiffies64_to_nsecs(u64 j)
+{
+#if !(NSEC_PER_SEC % HZ)
+ return (NSEC_PER_SEC / HZ) * j;
+# else
+ return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_nsecs);
+
/**
* nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
@@ -780,7 +790,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
{
struct timespec64 res;
- set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+ set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
lhs.tv_nsec + rhs.tv_nsec);
if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c48688904f9f..f83bbb81600b 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -98,6 +98,12 @@ define timeconst(hz) {
print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+
+ cd=gcd(hz,1000000000)
+ print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n"
+ print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n"
+ print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n"
print "\n"
print "#endif /* KERNEL_TIMECONST_H */\n"
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 4687b3104bae..8afd78932bdf 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(timecounter_init);
*/
static u64 timecounter_read_delta(struct timecounter *tc)
{
- cycle_t cycle_now, cycle_delta;
+ u64 cycle_now, cycle_delta;
u64 ns_offset;
/* read cycle counter: */
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(timecounter_read);
* time previous to the time stored in the cycle counter.
*/
static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
- cycle_t cycles, u64 mask, u64 frac)
+ u64 cycles, u64 mask, u64 frac)
{
u64 ns = (u64) cycles;
@@ -90,7 +90,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
}
u64 timecounter_cyc2time(struct timecounter *tc,
- cycle_t cycle_tstamp)
+ u64 cycle_tstamp)
{
u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
u64 nsec = tc->nsec, frac = tc->frac;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e07fb093f819..95b258dd75db 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -104,7 +104,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
*/
set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
-tk->wall_to_monotonic.tv_nsec);
- WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
+ WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
tk->wall_to_monotonic = wtm;
set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
tk->offs_real = timespec64_to_ktime(tmp);
@@ -119,10 +119,10 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
- cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+ u64 max_cycles = tk->tkr_mono.clock->max_cycles;
const char *name = tk->tkr_mono.clock->name;
if (offset > max_cycles) {
@@ -158,10 +158,10 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
}
}
-static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
{
struct timekeeper *tk = &tk_core.timekeeper;
- cycle_t now, last, mask, max, delta;
+ u64 now, last, mask, max, delta;
unsigned int seq;
/*
@@ -199,12 +199,12 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
return delta;
}
#else
-static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
-static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
{
- cycle_t cycle_now, delta;
+ u64 cycle_now, delta;
/* read clocksource */
cycle_now = tkr->read(tkr->clock);
@@ -229,7 +229,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
- cycle_t interval;
+ u64 interval;
u64 tmp, ntpinterval;
struct clocksource *old_clock;
@@ -254,14 +254,13 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (tmp == 0)
tmp = 1;
- interval = (cycle_t) tmp;
+ interval = (u64) tmp;
tk->cycle_interval = interval;
/* Go back from cycles -> shifted ns */
- tk->xtime_interval = (u64) interval * clock->mult;
+ tk->xtime_interval = interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval =
- ((u64) interval * clock->mult) >> clock->shift;
+ tk->raw_interval = (interval * clock->mult) >> clock->shift;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
@@ -299,10 +298,9 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
- cycle_t delta)
+static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
{
- s64 nsec;
+ u64 nsec;
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@@ -311,18 +309,17 @@ static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t delta;
+ u64 delta;
delta = timekeeping_get_delta(tkr);
return timekeeping_delta_to_ns(tkr, delta);
}
-static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
- cycle_t cycles)
+static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles)
{
- cycle_t delta;
+ u64 delta;
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
@@ -403,8 +400,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
- now += clocksource_delta(tkr->read(tkr->clock),
- tkr->cycle_last, tkr->mask);
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -422,10 +422,39 @@ u64 ktime_get_raw_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) It's possible that a timestamp is taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ * CPU 0 CPU 1
+ * timekeeping_inject_sleeptime64()
+ * __timekeeping_inject_sleeptime(tk, delta);
+ * timestamp();
+ * timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated. Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
/* Suspend-time cycles value for halted fast timekeeper. */
-static cycle_t cycles_at_suspend;
+static u64 cycles_at_suspend;
-static cycle_t dummy_clock_read(struct clocksource *cs)
+static u64 dummy_clock_read(struct clocksource *cs)
{
return cycles_at_suspend;
}
@@ -542,7 +571,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
static inline void tk_update_leap_state(struct timekeeper *tk)
{
tk->next_leap_ktime = ntp_get_next_leap();
- if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+ if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
@@ -619,8 +648,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
static void timekeeping_forward_now(struct timekeeper *tk)
{
struct clocksource *clock = tk->tkr_mono.clock;
- cycle_t cycle_now, delta;
- s64 nsec;
+ u64 cycle_now, delta;
+ u64 nsec;
cycle_now = tk->tkr_mono.read(clock);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
@@ -649,7 +678,7 @@ int __getnstimeofday64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
- s64 nsecs = 0;
+ u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -689,7 +718,7 @@ ktime_t ktime_get(void)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base;
- s64 nsecs;
+ u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -732,7 +761,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
- s64 nsecs;
+ u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -776,7 +805,7 @@ ktime_t ktime_get_raw(void)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base;
- s64 nsecs;
+ u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -801,8 +830,8 @@ void ktime_get_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 tomono;
- s64 nsec;
unsigned int seq;
+ u64 nsec;
WARN_ON(timekeeping_suspended);
@@ -890,9 +919,9 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
unsigned long seq;
ktime_t base_raw;
ktime_t base_real;
- s64 nsec_raw;
- s64 nsec_real;
- cycle_t now;
+ u64 nsec_raw;
+ u64 nsec_real;
+ u64 now;
WARN_ON_ONCE(timekeeping_suspended);
@@ -951,8 +980,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
* interval is partial_history_cycles.
*/
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
- cycle_t partial_history_cycles,
- cycle_t total_history_cycles,
+ u64 partial_history_cycles,
+ u64 total_history_cycles,
bool discontinuity,
struct system_device_crosststamp *ts)
{
@@ -1016,7 +1045,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
/*
* cycle_between - true if test occurs chronologically between before and after
*/
-static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
+static bool cycle_between(u64 before, u64 test, u64 after)
{
if (test > before && test < after)
return true;
@@ -1046,10 +1075,10 @@ int get_device_system_crosststamp(int (*get_time_fn)
{
struct system_counterval_t system_counterval;
struct timekeeper *tk = &tk_core.timekeeper;
- cycle_t cycles, now, interval_start;
+ u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
ktime_t base_real, base_raw;
- s64 nsec_real, nsec_raw;
+ u64 nsec_real, nsec_raw;
u8 cs_was_changed_seq;
unsigned long seq;
bool do_interp;
@@ -1107,7 +1136,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* current interval
*/
if (do_interp) {
- cycle_t partial_history_cycles, total_history_cycles;
+ u64 partial_history_cycles, total_history_cycles;
bool discontinuity;
/*
@@ -1246,27 +1275,8 @@ error: /* even if we error out, we forwarded the time, so call update */
}
EXPORT_SYMBOL(timekeeping_inject_offset);
-
/**
- * timekeeping_get_tai_offset - Returns current TAI offset from UTC
- *
- */
-s32 timekeeping_get_tai_offset(void)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
- s32 ret;
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tai_offset;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- return ret;
-}
-
-/**
- * __timekeeping_set_tai_offset - Lock free worker function
+ * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
*
*/
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
@@ -1276,24 +1286,6 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
}
/**
- * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
- *
- */
-void timekeeping_set_tai_offset(s32 tai_offset)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
- __timekeeping_set_tai_offset(tk, tai_offset);
- timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&tk_core.seq);
- raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clock_was_set();
-}
-
-/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
@@ -1362,7 +1354,7 @@ void getrawmonotonic64(struct timespec64 *ts)
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts64;
unsigned long seq;
- s64 nsecs;
+ u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -1613,7 +1605,7 @@ void timekeeping_resume(void)
struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
- cycle_t cycle_now, cycle_delta;
+ u64 cycle_now;
sleeptime_injected = false;
read_persistent_clock64(&ts_new);
@@ -1639,27 +1631,11 @@ void timekeeping_resume(void)
cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
- u64 num, max = ULLONG_MAX;
- u32 mult = clock->mult;
- u32 shift = clock->shift;
- s64 nsec = 0;
-
- cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
- tk->tkr_mono.mask);
-
- /*
- * "cycle_delta * mutl" may cause 64 bits overflow, if the
- * suspended time is too long. In that case we need do the
- * 64 bits math carefully
- */
- do_div(max, mult);
- if (cycle_delta > max) {
- num = div64_u64(cycle_delta, max);
- nsec = (((u64) max * mult) >> shift) * num;
- cycle_delta -= num * max;
- }
- nsec += ((u64) cycle_delta * mult) >> shift;
+ u64 nsec, cyc_delta;
+ cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
+ nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift);
ts_delta = ns_to_timespec64(nsec);
sleeptime_injected = true;
} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
@@ -1995,11 +1971,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
*
* Returns the unconsumed cycles.
*/
-static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
- u32 shift,
- unsigned int *clock_set)
+static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
+ u32 shift, unsigned int *clock_set)
{
- cycle_t interval = tk->cycle_interval << shift;
+ u64 interval = tk->cycle_interval << shift;
u64 raw_nsecs;
/* If the offset is smaller than a shifted interval, do nothing */
@@ -2040,7 +2015,7 @@ void update_wall_time(void)
{
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
- cycle_t offset;
+ u64 offset;
int shift = 0, maxshift;
unsigned int clock_set = 0;
unsigned long flags;
@@ -2238,7 +2213,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
}
/* Handle leapsecond insertion adjustments */
- if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+ if (unlikely(base >= tk->next_leap_ktime))
*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
} while (read_seqcount_retry(&tk_core.seq, seq));
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 704f595ce83f..d0914676d4c5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -11,8 +11,6 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern int timekeeping_inject_offset(struct timespec *ts);
-extern s32 timekeeping_get_tai_offset(void);
-extern void timekeeping_set_tai_offset(s32 tai_offset);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 107310a6f36f..38bc4d2208e8 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
+ printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 5be76270ec4a..9a18f121f399 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -13,9 +13,9 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t);
#endif
#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
-static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
{
- cycle_t ret = (now - last) & mask;
+ u64 ret = (now - last) & mask;
/*
* Prevent time going backwards by checking the MSB of mask in
@@ -24,7 +24,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
return ret & ~(mask >> 1) ? 0 : ret;
}
#else
-static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
{
return (now - last) & mask;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 32bf6f75a8fe..82a6bfa0c307 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,7 +43,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
@@ -571,38 +571,6 @@ internal_add_timer(struct timer_base *base, struct timer_list *timer)
trigger_dyntick_cpu(base, timer);
}
-#ifdef CONFIG_TIMER_STATS
-void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
-{
- if (timer->start_site)
- return;
-
- timer->start_site = addr;
- memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
- timer->start_pid = current->pid;
-}
-
-static void timer_stats_account_timer(struct timer_list *timer)
-{
- void *site;
-
- /*
- * start_site can be concurrently reset by
- * timer_stats_timer_clear_start_info()
- */
- site = READ_ONCE(timer->start_site);
- if (likely(!site))
- return;
-
- timer_stats_update_stats(timer, timer->start_pid, site,
- timer->function, timer->start_comm,
- timer->flags);
-}
-
-#else
-static void timer_stats_account_timer(struct timer_list *timer) {}
-#endif
-
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr timer_debug_descr;
@@ -789,11 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
{
timer->entry.pprev = NULL;
timer->flags = flags | raw_smp_processor_id();
-#ifdef CONFIG_TIMER_STATS
- timer->start_site = NULL;
- timer->start_pid = -1;
- memset(timer->start_comm, 0, TASK_COMM_LEN);
-#endif
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -878,7 +841,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
#ifdef CONFIG_SMP
if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +854,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
+ unsigned long jnow = READ_ONCE(jiffies);
+
/*
* We only forward the base when it's idle and we have a delta between
* base clock and jiffies.
*/
- if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+ if (!base->is_idle || (long) (jnow - base->clk) < 2)
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jiffies))
- base->clk = jiffies;
+ if (time_after(base->next_expiry, jnow))
+ base->clk = jnow;
else
base->clk = base->next_expiry;
}
#else
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
return get_timer_this_cpu_base(tflags);
}
@@ -917,14 +882,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base) { }
#endif
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- struct timer_base *target = __get_target_base(base, tflags);
-
- forward_timer_base(target);
- return target;
-}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +900,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
{
for (;;) {
struct timer_base *base;
- u32 tf = timer->flags;
+ u32 tf;
+
+ /*
+ * We need to use READ_ONCE() here, otherwise the compiler
+ * might re-read @tf between the check for TIMER_MIGRATING
+ * and spin_lock().
+ */
+ tf = READ_ONCE(timer->flags);
if (!(tf & TIMER_MIGRATING)) {
base = get_timer_base(tf);
@@ -964,6 +928,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
unsigned long clk = 0, flags;
int ret = 0;
+ BUG_ON(!timer->function);
+
/*
* This is a common optimization triggered by the networking code - if
* the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +938,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
if (timer_pending(timer)) {
if (timer->expires == expires)
return 1;
+
/*
- * Take the current timer_jiffies of base, but without holding
- * the lock!
+ * We lock timer base and calculate the bucket index right
+ * here. If the timer ends up in the same bucket, then we
+ * just update the expiry time and avoid the whole
+ * dequeue/enqueue dance.
*/
- base = get_timer_base(timer->flags);
- clk = base->clk;
+ base = lock_timer_base(timer, &flags);
+ clk = base->clk;
idx = calc_wheel_index(expires, clk);
/*
@@ -988,15 +957,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
*/
if (idx == timer_get_idx(timer)) {
timer->expires = expires;
- return 1;
+ ret = 1;
+ goto out_unlock;
}
+ } else {
+ base = lock_timer_base(timer, &flags);
}
- timer_stats_timer_set_start_info(timer);
- BUG_ON(!timer->function);
-
- base = lock_timer_base(timer, &flags);
-
ret = detach_if_pending(timer, base, false);
if (!ret && pending_only)
goto out_unlock;
@@ -1025,12 +992,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
}
+ /* Try to forward a stale timer base clock */
+ forward_timer_base(base);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
- * between calculating 'idx' and taking the lock, only enqueue_timer()
- * and trigger_dyntick_cpu() is required. Otherwise we need to
- * (re)calculate the wheel index via internal_add_timer().
+ * between calculating 'idx' and possibly switching the base, only
+ * enqueue_timer() and trigger_dyntick_cpu() are required. Otherwise
+ * we need to (re)calculate the wheel index via
+ * internal_add_timer().
*/
if (idx != UINT_MAX && clk == base->clk) {
enqueue_timer(base, timer, idx);
@@ -1120,7 +1091,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
struct timer_base *new_base, *base;
unsigned long flags;
- timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1166,7 +1136,6 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
@@ -1194,10 +1163,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer) {
- timer_stats_timer_clear_start_info(timer);
+ if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
- }
+
spin_unlock_irqrestore(&base->lock, flags);
return ret;
@@ -1321,7 +1289,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
unsigned long data;
timer = hlist_entry(head->first, struct timer_list, entry);
- timer_stats_account_timer(timer);
base->running_timer = timer;
detach_timer(timer, true);
@@ -1510,12 +1477,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
base->next_expiry = nextevt;
/*
- * We have a fresh next event. Check whether we can forward the base:
+ * We have a fresh next event. Check whether we can forward the
+ * base. We can only do that when @basej is past base->clk,
+ * otherwise we might rewind base->clk.
*/
- if (time_after(nextevt, jiffies))
- base->clk = jiffies;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
+ if (time_after(basej, base->clk)) {
+ if (time_after(nextevt, basej))
+ base->clk = basej;
+ else if (time_after(nextevt, base->clk))
+ base->clk = nextevt;
+ }
if (time_before_eq(nextevt, basej)) {
expires = basem;
@@ -1601,7 +1572,8 @@ void update_process_times(int user_tick)
irq_work_tick();
#endif
scheduler_tick();
- run_posix_cpu_timers(p);
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ run_posix_cpu_timers(p);
}
/**
@@ -1633,7 +1605,7 @@ static inline void __run_timers(struct timer_base *base)
/*
* This function runs timers and the timer-tq in bottom half context.
*/
-static void run_timer_softirq(struct softirq_action *h)
+static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
@@ -1662,19 +1634,6 @@ void run_local_timers(void)
raise_softirq(TIMER_SOFTIRQ);
}
-#ifdef __ARCH_WANT_SYS_ALARM
-
-/*
- * For backwards compatibility? This can be done in libc so Alpha
- * and all newer ports shouldn't need it.
- */
-SYSCALL_DEFINE1(alarm, unsigned int, seconds)
-{
- return alarm_setitimer(seconds);
-}
-
-#endif
-
static void process_timeout(unsigned long __data)
{
wake_up_process((struct task_struct *)__data);
@@ -1691,11 +1650,12 @@ static void process_timeout(unsigned long __data)
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
- * pass before the routine returns. The routine will return 0
+ * pass before the routine returns unless the current task is explicitly
+ * woken up (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * in jiffies will be returned, or 0 if the timer expired in time
+ * delivered to the current task or the current task is explicitly woken
+ * up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
@@ -1704,7 +1664,9 @@ static void process_timeout(unsigned long __data)
* the CPU away without a bound on the timeout. In this case the return
* value will be %MAX_SCHEDULE_TIMEOUT.
*
- * In all cases the return value is guaranteed to be non-negative.
+ * Returns 0 when the timer has expired; otherwise the remaining time in
+ * jiffies will be returned. In all cases the return value is guaranteed
+ * to be non-negative.
*/
signed long __sched schedule_timeout(signed long timeout)
{
@@ -1863,7 +1825,6 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
- init_timer_stats();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1896,16 +1857,6 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
-static void __sched do_usleep_range(unsigned long min, unsigned long max)
-{
- ktime_t kmin;
- u64 delta;
-
- kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (u64)(max - min) * NSEC_PER_USEC;
- schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
-}
-
/**
* usleep_range - Sleep for an approximate time
* @min: Minimum time in usecs to sleep
@@ -1919,7 +1870,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
*/
void __sched usleep_range(unsigned long min, unsigned long max)
{
- __set_current_state(TASK_UNINTERRUPTIBLE);
- do_usleep_range(min, max);
+ ktime_t exp = ktime_add_us(ktime_get(), min);
+ u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+ for (;;) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ /* Do not return before the requested sleep time has elapsed */
+ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
+ break;
+ }
}
EXPORT_SYMBOL(usleep_range);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ba7d8b288bb3..ff8d5c13d04b 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -17,7 +17,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "tick-internal.h"
@@ -62,21 +62,11 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
-#ifdef CONFIG_TIMER_STATS
- char tmp[TASK_COMM_LEN + 1];
-#endif
SEQ_printf(m, " #%d: ", idx);
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
-#ifdef CONFIG_TIMER_STATS
- SEQ_printf(m, ", ");
- print_name_offset(m, timer->start_site);
- memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
- tmp[TASK_COMM_LEN] = 0;
- SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
-#endif
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -127,7 +117,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .base: %pK\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
- SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+ SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
deleted file mode 100644
index 087204c733eb..000000000000
--- a/kernel/time/timer_stats.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * kernel/time/timer_stats.c
- *
- * Collect timer usage statistics.
- *
- * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
- * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * timer_stats is based on timer_top, a similar functionality which was part of
- * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
- * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
- * on dynamic allocation of the statistics entries and linear search based
- * lookup combined with a global lock, rather than the static array, hash
- * and per-CPU locking which is used by timer_stats. It was written for the
- * pre hrtimer kernel code and therefore did not take hrtimers into account.
- * Nevertheless it provided the base for the timer_stats implementation and
- * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
- * for this effort.
- *
- * timer_top.c is
- * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
- * Written by Daniel Petrini <d.pensator@gmail.com>
- * timer_top.c was released under the GNU General Public License version 2
- *
- * We export the addresses and counting of timer functions being called,
- * the pid and cmdline from the owner process if applicable.
- *
- * Start/stop data collection:
- * # echo [1|0] >/proc/timer_stats
- *
- * Display the information collected so far:
- * # cat /proc/timer_stats
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-
-#include <asm/uaccess.h>
-
-/*
- * This is our basic unit of interest: a timer expiry event identified
- * by the timer, its start/expire functions and the PID of the task that
- * started the timer. We count the number of times an event happens:
- */
-struct entry {
- /*
- * Hash list:
- */
- struct entry *next;
-
- /*
- * Hash keys:
- */
- void *timer;
- void *start_func;
- void *expire_func;
- pid_t pid;
-
- /*
- * Number of timeout events:
- */
- unsigned long count;
- u32 flags;
-
- /*
- * We save the command-line string to preserve
- * this information past task exit:
- */
- char comm[TASK_COMM_LEN + 1];
-
-} ____cacheline_aligned_in_smp;
-
-/*
- * Spinlock protecting the tables - not taken during lookup:
- */
-static DEFINE_RAW_SPINLOCK(table_lock);
-
-/*
- * Per-CPU lookup locks for fast hash lookup:
- */
-static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
-
-/*
- * Mutex to serialize state changes with show-stats activities:
- */
-static DEFINE_MUTEX(show_mutex);
-
-/*
- * Collection status, active/inactive:
- */
-int __read_mostly timer_stats_active;
-
-/*
- * Beginning/end timestamps of measurement:
- */
-static ktime_t time_start, time_stop;
-
-/*
- * tstat entry structs only get allocated while collection is
- * active and never freed during that time - this simplifies
- * things quite a bit.
- *
- * They get freed when a new collection period is started.
- */
-#define MAX_ENTRIES_BITS 10
-#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
-
-static unsigned long nr_entries;
-static struct entry entries[MAX_ENTRIES];
-
-static atomic_t overflow_count;
-
-/*
- * The entries are in a hash-table, for fast lookup:
- */
-#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
-#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
-#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
-
-#define __tstat_hashfn(entry) \
- (((unsigned long)(entry)->timer ^ \
- (unsigned long)(entry)->start_func ^ \
- (unsigned long)(entry)->expire_func ^ \
- (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
-
-#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
-
-static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
-
-static void reset_entries(void)
-{
- nr_entries = 0;
- memset(entries, 0, sizeof(entries));
- memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
- atomic_set(&overflow_count, 0);
-}
-
-static struct entry *alloc_entry(void)
-{
- if (nr_entries >= MAX_ENTRIES)
- return NULL;
-
- return entries + nr_entries++;
-}
-
-static int match_entries(struct entry *entry1, struct entry *entry2)
-{
- return entry1->timer == entry2->timer &&
- entry1->start_func == entry2->start_func &&
- entry1->expire_func == entry2->expire_func &&
- entry1->pid == entry2->pid;
-}
-
-/*
- * Look up whether an entry matching this item is present
- * in the hash already. Must be called with irqs off and the
- * lookup lock held:
- */
-static struct entry *tstat_lookup(struct entry *entry, char *comm)
-{
- struct entry **head, *curr, *prev;
-
- head = tstat_hashentry(entry);
- curr = *head;
-
- /*
- * The fastpath is when the entry is already hashed,
- * we do this with the lookup lock held, but with the
- * table lock not held:
- */
- while (curr) {
- if (match_entries(curr, entry))
- return curr;
-
- curr = curr->next;
- }
- /*
- * Slowpath: allocate, set up and link a new hash entry:
- */
- prev = NULL;
- curr = *head;
-
- raw_spin_lock(&table_lock);
- /*
- * Make sure we have not raced with another CPU:
- */
- while (curr) {
- if (match_entries(curr, entry))
- goto out_unlock;
-
- prev = curr;
- curr = curr->next;
- }
-
- curr = alloc_entry();
- if (curr) {
- *curr = *entry;
- curr->count = 0;
- curr->next = NULL;
- memcpy(curr->comm, comm, TASK_COMM_LEN);
-
- smp_mb(); /* Ensure that curr is initialized before insert */
-
- if (prev)
- prev->next = curr;
- else
- *head = curr;
- }
- out_unlock:
- raw_spin_unlock(&table_lock);
-
- return curr;
-}
-
-/**
- * timer_stats_update_stats - Update the statistics for a timer.
- * @timer: pointer to either a timer_list or a hrtimer
- * @pid: the pid of the task which set up the timer
- * @startf: pointer to the function which did the timer setup
- * @timerf: pointer to the timer callback function of the timer
- * @comm: name of the process which set up the timer
- * @tflags: The flags field of the timer
- *
- * When the timer is already registered, then the event counter is
- * incremented. Otherwise the timer is registered in a free slot.
- */
-void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm, u32 tflags)
-{
- /*
- * It doesn't matter which lock we take:
- */
- raw_spinlock_t *lock;
- struct entry *entry, input;
- unsigned long flags;
-
- if (likely(!timer_stats_active))
- return;
-
- lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
-
- input.timer = timer;
- input.start_func = startf;
- input.expire_func = timerf;
- input.pid = pid;
- input.flags = tflags;
-
- raw_spin_lock_irqsave(lock, flags);
- if (!timer_stats_active)
- goto out_unlock;
-
- entry = tstat_lookup(&input, comm);
- if (likely(entry))
- entry->count++;
- else
- atomic_inc(&overflow_count);
-
- out_unlock:
- raw_spin_unlock_irqrestore(lock, flags);
-}
-
-static void print_name_offset(struct seq_file *m, unsigned long addr)
-{
- char symname[KSYM_NAME_LEN];
-
- if (lookup_symbol_name(addr, symname) < 0)
- seq_printf(m, "<%p>", (void *)addr);
- else
- seq_printf(m, "%s", symname);
-}
-
-static int tstats_show(struct seq_file *m, void *v)
-{
- struct timespec64 period;
- struct entry *entry;
- unsigned long ms;
- long events = 0;
- ktime_t time;
- int i;
-
- mutex_lock(&show_mutex);
- /*
- * If still active then calculate up to now:
- */
- if (timer_stats_active)
- time_stop = ktime_get();
-
- time = ktime_sub(time_stop, time_start);
-
- period = ktime_to_timespec64(time);
- ms = period.tv_nsec / 1000000;
-
- seq_puts(m, "Timer Stats Version: v0.3\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
- if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
- seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
-
- for (i = 0; i < nr_entries; i++) {
- entry = entries + i;
- if (entry->flags & TIMER_DEFERRABLE) {
- seq_printf(m, "%4luD, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- } else {
- seq_printf(m, " %4lu, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
- }
-
- print_name_offset(m, (unsigned long)entry->start_func);
- seq_puts(m, " (");
- print_name_offset(m, (unsigned long)entry->expire_func);
- seq_puts(m, ")\n");
-
- events += entry->count;
- }
-
- ms += period.tv_sec * 1000;
- if (!ms)
- ms = 1;
-
- if (events && period.tv_sec)
- seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
- events, events * 1000 / ms,
- (events * 1000000 / ms) % 1000);
- else
- seq_printf(m, "%ld total events\n", events);
-
- mutex_unlock(&show_mutex);
-
- return 0;
-}
-
-/*
- * After a state change, make sure all concurrent lookup/update
- * activities have stopped:
- */
-static void sync_access(void)
-{
- unsigned long flags;
- int cpu;
-
- for_each_online_cpu(cpu) {
- raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
-
- raw_spin_lock_irqsave(lock, flags);
- /* nothing */
- raw_spin_unlock_irqrestore(lock, flags);
- }
-}
-
-static ssize_t tstats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *offs)
-{
- char ctl[2];
-
- if (count != 2 || *offs)
- return -EINVAL;
-
- if (copy_from_user(ctl, buf, count))
- return -EFAULT;
-
- mutex_lock(&show_mutex);
- switch (ctl[0]) {
- case '0':
- if (timer_stats_active) {
- timer_stats_active = 0;
- time_stop = ktime_get();
- sync_access();
- }
- break;
- case '1':
- if (!timer_stats_active) {
- reset_entries();
- time_start = ktime_get();
- smp_mb();
- timer_stats_active = 1;
- }
- break;
- default:
- count = -EINVAL;
- }
- mutex_unlock(&show_mutex);
-
- return count;
-}
-
-static int tstats_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, tstats_show, NULL);
-}
-
-static const struct file_operations tstats_fops = {
- .open = tstats_open,
- .read = seq_read,
- .write = tstats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-void __init init_timer_stats(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
-}
-
-static int __init init_tstats_procfs(void)
-{
- struct proc_dir_entry *pe;
-
- pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
- if (!pe)
- return -ENOMEM;
- return 0;
-}
-__initcall(init_tstats_procfs);
diff --git a/kernel/torture.c b/kernel/torture.c
index 75961b3decfe..0d887eb62856 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -43,6 +43,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/trace_clock.h>
+#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
@@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
* Variables for auto-shutdown. This allows "lights out" torture runs
* to be fully scripted.
*/
-static int shutdown_secs; /* desired test duration in seconds. */
static struct task_struct *shutdown_task;
-static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static ktime_t shutdown_time; /* time to system shutdown. */
static void (*torture_shutdown_hook)(void);
/*
@@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
*/
static int torture_shutdown(void *arg)
{
- long delta;
- unsigned long jiffies_snap;
+ ktime_t ktime_snap;
VERBOSE_TOROUT_STRING("torture_shutdown task started");
- jiffies_snap = jiffies;
- while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ ktime_snap = ktime_get();
+ while (ktime_before(ktime_snap, shutdown_time) &&
!torture_must_stop()) {
- delta = shutdown_time - jiffies_snap;
if (verbose)
pr_alert("%s" TORTURE_FLAG
- "torture_shutdown task: %lu jiffies remaining\n",
- torture_type, delta);
- schedule_timeout_interruptible(delta);
- jiffies_snap = jiffies;
+ "torture_shutdown task: %llu ms remaining\n",
+ torture_type,
+ ktime_ms_delta(shutdown_time, ktime_snap));
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS);
+ ktime_snap = ktime_get();
}
if (torture_must_stop()) {
torture_kthread_stopping("torture_shutdown");
@@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
{
int ret = 0;
- shutdown_secs = ssecs;
torture_shutdown_hook = cleanup;
- if (shutdown_secs > 0) {
- shutdown_time = jiffies + shutdown_secs * HZ;
+ if (ssecs > 0) {
+ shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
ret = torture_create_kthread(torture_shutdown, NULL,
shutdown_task);
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f4b86e8ca1e7..d5038005eb5d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_GRAPH_FP_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -75,6 +70,7 @@ config FTRACE_NMI_ENTER
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
+ select GLOB
bool
config CONTEXT_SWITCH_TRACER
@@ -138,6 +134,7 @@ config FUNCTION_TRACER
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
+ select GLOB
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
@@ -221,6 +218,41 @@ config SCHED_TRACER
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
+config HWLAT_TRACER
+ bool "Tracer to detect hardware latencies (like SMIs)"
+ select GENERIC_TRACER
+ help
+ This tracer, when enabled, will create one or more kernel threads,
+ depending on what the cpumask file is set to, with each thread
+ spinning in a loop looking for interruptions caused by
+ something other than the kernel. For example, if a
+ System Management Interrupt (SMI) takes a noticeable amount of
+ time, this tracer will detect it. This is useful for testing
+ if a system is reliable for Real Time tasks.
+
+ Some files are created in the tracing directory when this
+ is enabled:
+
+ hwlat_detector/width - time in usecs for how long to spin for
+ hwlat_detector/window - time in usecs between the start of each
+ iteration
+
+ A kernel thread is created that will spin with interrupts disabled
+ for "width" microseconds in every "window" cycle. It will not spin
+ for "window - width" microseconds, where the system can
+ continue to operate.
+
+ The output will appear in the trace and trace_pipe files.
+
+ When the tracer is not running, it has no effect on the system,
+ but when it is running, it can cause the system to be
+ periodically non-responsive. Do not run this tracer on a
+ production system.
+
+ To enable this tracer, echo "hwlat" into the current_tracer
+ file. Every time a latency is greater than tracing_thresh, it will
+ be recorded into the ring buffer.
+
config ENABLE_DEFAULT_TRACERS
bool "Trace process context switches and events"
depends on !GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d0a1617b52b4..e57980845549 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,8 +1,4 @@
-# We are fully aware of the dangers of __builtin_return_address()
-FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
-KBUILD_CFLAGS += $(FRAME_CFLAGS)
-
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
@@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index dbafc5df03f3..b2058a7f94bd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -28,6 +28,8 @@
#include <linux/uaccess.h>
#include <linux/list.h>
+#include "../../block/blk.h"
+
#include <trace/events/block.h>
#include "trace_output.h"
@@ -292,9 +294,6 @@ record_it:
local_irq_restore(flags);
}
-static struct dentry *blk_tree_root;
-static DEFINE_MUTEX(blk_tree_mutex);
-
static void blk_trace_free(struct blk_trace *bt)
{
debugfs_remove(bt->msg_file);
@@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -ENOENT;
- mutex_lock(&blk_tree_mutex);
- if (!blk_tree_root) {
- blk_tree_root = debugfs_create_dir("block", NULL);
- if (!blk_tree_root) {
- mutex_unlock(&blk_tree_mutex);
- goto err;
- }
- }
- mutex_unlock(&blk_tree_mutex);
-
- dir = debugfs_create_dir(buts->name, blk_tree_root);
+ if (!blk_debugfs_root)
+ goto err;
+ dir = debugfs_lookup(buts->name, blk_debugfs_root);
+ if (!dir)
+ bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
if (!dir)
goto err;
- bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
@@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
- return 0;
+ ret = 0;
err:
- blk_trace_free(bt);
+ if (dir && !bt->dir)
+ dput(dir);
+ if (ret)
+ blk_trace_free(bt);
return ret;
}
@@ -712,15 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
- what, rq->errors, rq->cmd_len, rq->cmd);
- } else {
+ else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
- }
+
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
static void blk_add_trace_rq_abort(void *ignore,
@@ -972,11 +965,7 @@ void blk_add_driver_data(struct request_queue *q,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
- else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1752,39 +1741,14 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_dump_cmd(char *buf, struct request *rq)
-{
- int i, end;
- int len = rq->cmd_len;
- unsigned char *cmd = rq->cmd;
-
- if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
- buf[0] = '\0';
- return;
- }
-
- for (end = len - 1; end >= 0; end--)
- if (cmd[end])
- break;
- end++;
-
- for (i = 0; i < len; i++) {
- buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
- if (i == end && end != len - 1) {
- sprintf(buf, " ..");
- break;
- }
- }
-}
-
-void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
{
int i = 0;
- if (rw & REQ_PREFLUSH)
+ if (op & REQ_PREFLUSH)
rwbs[i++] = 'F';
- switch (op) {
+ switch (op & REQ_OP_MASK) {
case REQ_OP_WRITE:
case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
@@ -1806,13 +1770,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
rwbs[i++] = 'N';
}
- if (rw & REQ_FUA)
+ if (op & REQ_FUA)
rwbs[i++] = 'F';
- if (rw & REQ_RAHEAD)
+ if (op & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (rw & REQ_SYNC)
+ if (op & REQ_SYNC)
rwbs[i++] = 'S';
- if (rw & REQ_META)
+ if (op & REQ_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438fdb029..fa77311dadb2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -8,6 +9,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
@@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
-static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
- void *dst = (void *) (long) r1;
- int ret, size = (int) r2;
- void *unsafe_ptr = (void *) (long) r3;
+ int ret;
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
@@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
-static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
+ u32, size)
{
- void *unsafe_ptr = (void *) (long) r1;
- void *src = (void *) (long) r2;
- int size = (int) r3;
-
/*
* Ensure we're in user context which is safe for the helper to
* run. This helper has no business in a kthread.
@@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
*/
-static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
+ u64, arg2, u64, arg3)
{
- char *fmt = (char *) (long) r1;
bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
@@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
switch (fmt_cnt) {
case 1:
- unsafe_addr = r3;
- r3 = (long) buf;
+ unsafe_addr = arg1;
+ arg1 = (long) buf;
break;
case 2:
- unsafe_addr = r4;
- r4 = (long) buf;
+ unsafe_addr = arg2;
+ arg2 = (long) buf;
break;
case 3:
- unsafe_addr = r5;
- r5 = (long) buf;
+ unsafe_addr = arg3;
+ arg3 = (long) buf;
break;
}
buf[0] = 0;
@@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
}
return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
- mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
- mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+ mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
+ mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
+ mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
@@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
return 0;
}
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
{
- struct pt_regs *regs = (struct pt_regs *)(long) r1;
- struct bpf_map *map = (struct bpf_map *)(long) r2;
- void *data = (void *)(long) r4;
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
return __bpf_perf_event_output(regs, map, flags, &raw);
}
-static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_task)
{
return (long) current;
}
@@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(in_interrupt()))
+ return -EINVAL;
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -403,10 +422,16 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
case BPF_FUNC_probe_write_user:
return bpf_get_probe_write_proto();
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
default:
return NULL;
}
@@ -447,16 +472,17 @@ static struct bpf_prog_type_list kprobe_tl = {
.type = BPF_PROG_TYPE_KPROBE,
};
-static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
{
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
- * from there and call the same bpf_perf_event_output() helper
+ * from there and call the same bpf_perf_event_output() helper inline.
*/
- u64 ctx = *(long *)(uintptr_t)r1;
-
- return bpf_perf_event_output(ctx, r2, index, r4, size);
+ return ____bpf_perf_event_output(regs, map, flags, data, size);
}
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
@@ -470,11 +496,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
-static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
+ u64, flags)
{
- u64 ctx = *(long *)(uintptr_t)r1;
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
- return bpf_get_stackid(ctx, r2, r3, r4, r5);
+ /*
+ * Same comment as in bpf_perf_event_output_tp(), only that this time
+ * the other helper's function body cannot be inlined due to being
+ * external, thus we need to call raw helper function.
+ */
+ return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
}
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
@@ -520,10 +553,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
.type = BPF_PROG_TYPE_TRACEPOINT,
};
+static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+ if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
+ if (size != sizeof(u64))
+ return false;
+ } else {
+ if (size != sizeof(long))
+ return false;
+ }
+ return true;
+}
+
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct bpf_perf_event_data, sample_period):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ data), dst_reg, src_reg,
+ offsetof(struct bpf_perf_event_data_kern, data));
+ *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+ offsetof(struct perf_sample_data, period));
+ break;
+ default:
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ regs), dst_reg, src_reg,
+ offsetof(struct bpf_perf_event_data_kern, regs));
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_verifier_ops perf_event_prog_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = pe_prog_is_valid_access,
+ .convert_ctx_access = pe_prog_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list perf_event_tl = {
+ .ops = &perf_event_prog_ops,
+ .type = BPF_PROG_TYPE_PERF_EVENT,
+};
+
static int __init register_kprobe_prog_ops(void)
{
bpf_register_prog_type(&kprobe_tl);
bpf_register_prog_type(&tracepoint_tl);
+ bpf_register_prog_type(&perf_event_tl);
return 0;
}
late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84752c8e28b5..eb230f06ba41 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
+ int index = trace->depth;
+
function_profile_call(trace->func, 0, NULL, NULL);
+
+ if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
+ current->ret_stack[index].subtime = 0;
+
return 1;
}
@@ -1856,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
/* Update rec->flags */
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
/* We need to update only differences of filter_hash */
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -1878,6 +1888,10 @@ rollback:
/* Roll back what we did above */
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (rec == end)
goto err_out;
@@ -2391,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)
return;
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
failed = __ftrace_replace_code(rec, enable);
if (failed) {
ftrace_bug(failed, rec);
@@ -2757,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
@@ -2829,7 +2847,7 @@ static void ftrace_shutdown_sysctl(void)
}
}
-static cycle_t ftrace_update_time;
+static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2876,7 +2894,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
- cycle_t start, stop;
+ u64 start, stop;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
@@ -3493,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g)
memcmp(str + slen - g->len, g->search, g->len) == 0)
matched = 1;
break;
+ case MATCH_GLOB:
+ if (glob_match(g->search, str))
+ matched = 1;
+ break;
}
return matched;
@@ -3592,6 +3614,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
if (ret < 0) {
@@ -3787,6 +3813,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
do_for_each_ftrace_rec(pg, rec) {
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (!ftrace_match_record(rec, &func_g, NULL, 0))
continue;
@@ -4233,6 +4262,23 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+/**
+ * ftrace_ops_set_global_filter - setup ops to use global filters
+ * @ops - the ops which will use the global filters
+ *
+ * ftrace users who need global function trace filtering should call this.
+ * It can set the global filter only if ops were not initialized before.
+ */
+void ftrace_ops_set_global_filter(struct ftrace_ops *ops)
+{
+ if (ops->flags & FTRACE_OPS_FL_INITIALIZED)
+ return;
+
+ ftrace_ops_init(ops);
+ ops->func_hash = &global_ops.local_hash;
+}
+EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter);
+
static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
@@ -4679,6 +4725,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (ftrace_match_record(rec, &func_g, NULL, 0)) {
/* if it is in the array */
exists = false;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c143739b8d7..a85739efcc30 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
EXPORT_SYMBOL_GPL(ring_buffer_event_length);
/* inline for ring buffer fast paths */
-static void *
+static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
@@ -479,9 +479,7 @@ struct ring_buffer {
struct ring_buffer_per_cpu **buffers;
-#ifdef CONFIG_HOTPLUG_CPU
- struct notifier_block cpu_notify;
-#endif
+ struct hlist_node node;
u64 (*clock)(void);
struct rb_irq_work irq_work;
@@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-#ifdef CONFIG_HOTPLUG_CPU
-static int rb_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu);
-#endif
-
/**
* __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
@@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
long nr_pages;
int bsize;
int cpu;
+ int ret;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1303,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer)
return NULL;
- if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
@@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (nr_pages < 2)
nr_pages = 2;
- /*
- * In case of non-hotplug cpu, if the ring-buffer is allocated
- * in early initcall, it will not be notified of secondary cpus.
- * In that off case, we need to allocate for all possible cpus.
- */
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_begin();
- cpumask_copy(buffer->cpumask, cpu_online_mask);
-#else
- cpumask_copy(buffer->cpumask, cpu_possible_mask);
-#endif
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
- for_each_buffer_cpu(buffer, cpu) {
- buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
- if (!buffer->buffers[cpu])
- goto fail_free_buffers;
- }
+ cpu = raw_smp_processor_id();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
-#ifdef CONFIG_HOTPLUG_CPU
- buffer->cpu_notify.notifier_call = rb_cpu_notify;
- buffer->cpu_notify.priority = 0;
- __register_cpu_notifier(&buffer->cpu_notify);
- cpu_notifier_register_done();
-#endif
+ ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ if (ret < 0)
+ goto fail_free_buffers;
mutex_init(&buffer->mutex);
@@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_done();
-#endif
fail_free_buffer:
kfree(buffer);
@@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_begin();
- __unregister_cpu_notifier(&buffer->cpu_notify);
-#endif
+ cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_done();
-#endif
-
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
@@ -1829,48 +1798,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static inline void *
+static __always_inline void *
__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
{
return bpage->data + index;
}
-static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
+static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
{
return bpage->page->data + index;
}
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
{
return __rb_page_index(cpu_buffer->reader_page,
cpu_buffer->reader_page->read);
}
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned rb_page_commit(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
/* Size is determined by what has been committed */
-static inline unsigned rb_page_size(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
return rb_page_commit(bpage);
}
-static inline unsigned
+static __always_inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
return rb_page_commit(cpu_buffer->commit_page);
}
-static inline unsigned
+static __always_inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
@@ -2386,7 +2355,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
local_inc(&cpu_buffer->commits);
}
-static void
+static __always_inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long max_count;
@@ -2441,7 +2410,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
goto again;
}
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2486,7 +2455,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static inline bool
+static __always_inline bool
rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -2500,7 +2469,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_commit_index(cpu_buffer) == index;
}
-static void
+static __always_inline void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -2733,7 +2702,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return event;
}
-static struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
@@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rb_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+/*
+ * We only allocate new buffers, never free them if the CPU goes down.
+ * If we were to free the buffer, then the user would lose any trace that was in
+ * the buffer.
+ */
+int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
- struct ring_buffer *buffer =
- container_of(self, struct ring_buffer, cpu_notify);
- long cpu = (long)hcpu;
+ struct ring_buffer *buffer;
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (cpumask_test_cpu(cpu, buffer->cpumask))
- return NOTIFY_OK;
-
- nr_pages = 0;
- nr_pages_same = 1;
- /* check if all cpu sizes are same */
- for_each_buffer_cpu(buffer, cpu_i) {
- /* fill in the size from first enabled cpu */
- if (nr_pages == 0)
- nr_pages = buffer->buffers[cpu_i]->nr_pages;
- if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
- nr_pages_same = 0;
- break;
- }
- }
- /* allocate minimum pages, user can later expand it */
- if (!nr_pages_same)
- nr_pages = 2;
- buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
- if (!buffer->buffers[cpu]) {
- WARN(1, "failed to allocate ring buffer on CPU %ld\n",
- cpu);
- return NOTIFY_OK;
+ buffer = container_of(node, struct ring_buffer, node);
+ if (cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
}
- smp_wmb();
- cpumask_set_cpu(cpu, buffer->cpumask);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- /*
- * Do nothing.
- * If we were to free the buffer, then the user would
- * lose any trace that was in the buffer.
- */
- break;
- default:
- break;
}
- return NOTIFY_OK;
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu]) {
+ WARN(1, "failed to allocate ring buffer on CPU %u\n",
+ cpu);
+ return -ENOMEM;
+ }
+ smp_wmb();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ return 0;
}
-#endif
#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7bc56762ca35..d7449783987a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,6 +40,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
+#include <linux/trace.h>
#include <linux/sched/rt.h>
#include "trace.h"
@@ -68,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled;
/* Pipe tracepoints to printk */
struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
+static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
@@ -234,7 +236,7 @@ static int __init set_tracepoint_printk(char *str)
}
__setup("tp_printk", set_tracepoint_printk);
-unsigned long long ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
do_div(nsec, 1000);
@@ -571,7 +573,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
return read;
}
-static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+static u64 buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -585,7 +587,7 @@ static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
return ts;
}
-cycle_t ftrace_now(int cpu)
+u64 ftrace_now(int cpu)
{
return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
}
@@ -738,6 +740,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
#endif
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+ int type, unsigned long flags, int pc)
+{
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, flags, pc);
+ ent->type = type;
+}
+
+static __always_inline struct ring_buffer_event *
+__trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL)
+ trace_event_setup(event, type, flags, pc);
+
+ return event;
+}
+
static void tracer_tracing_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
@@ -767,6 +794,22 @@ void tracing_on(void)
}
EXPORT_SYMBOL_GPL(tracing_on);
+
+static __always_inline void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_cmdline_save, true);
+
+ /* If this is the temp buffer, we need to commit fully */
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Length is in event->array[0] */
+ ring_buffer_write(buffer, event->array[0], &event->array[1]);
+ /* Release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ } else
+ ring_buffer_unlock_commit(buffer, event);
+}
+
/**
* __trace_puts - write a constant string into the trace buffer.
* @ip: The address of the caller
@@ -794,8 +837,8 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ irq_flags, pc);
if (!event)
return 0;
@@ -842,8 +885,8 @@ int __trace_bputs(unsigned long ip, const char *str)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ irq_flags, pc);
if (!event)
return 0;
@@ -1047,7 +1090,7 @@ void disable_trace_on_warning(void)
*
* Shows real state of the ring buffer if it is enabled or not.
*/
-static int tracer_tracing_is_on(struct trace_array *tr)
+int tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -1125,6 +1168,7 @@ static struct {
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
+ { ktime_get_boot_fast_ns, "boot", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1906,35 +1950,19 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
-static __always_inline void
-trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned long flags, int pc)
-{
- struct trace_entry *ent = ring_buffer_event_data(event);
-
- tracing_generic_entry_update(ent, flags, pc);
- ent->type = type;
-}
-
struct ring_buffer_event *
trace_buffer_lock_reserve(struct ring_buffer *buffer,
int type,
unsigned long len,
unsigned long flags, int pc)
{
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(buffer, len);
- if (event != NULL)
- trace_event_setup(event, type, flags, pc);
-
- return event;
+ return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
}
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2048,21 +2076,6 @@ void trace_buffered_event_disable(void)
preempt_enable();
}
-void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
-{
- __this_cpu_write(trace_cmdline_save, true);
-
- /* If this is the temp buffer, we need to commit fully */
- if (this_cpu_read(trace_buffered_event) == event) {
- /* Length is in event->array[0] */
- ring_buffer_write(buffer, event->array[0], &event->array[1]);
- /* Release the temp buffer */
- this_cpu_dec(trace_buffered_event_cnt);
- } else
- ring_buffer_unlock_commit(buffer, event);
-}
-
static struct ring_buffer *temp_buffer;
struct ring_buffer_event *
@@ -2089,8 +2102,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
this_cpu_dec(trace_buffered_event_cnt);
}
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
/*
* If tracing is off, but we have triggers enabled
* we still need to look at the event data. Use the temp_buffer
@@ -2099,13 +2112,88 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
*/
if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
}
return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_MUTEX(tracepoint_printk_mutex);
+
+static void output_printk(struct trace_event_buffer *fbuffer)
+{
+ struct trace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ /* We should never get here if iter is NULL */
+ if (WARN_ON_ONCE(!iter))
+ return;
+
+ event_call = fbuffer->trace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->trace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
+int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int save_tracepoint_printk;
+ int ret;
+
+ mutex_lock(&tracepoint_printk_mutex);
+ save_tracepoint_printk = tracepoint_printk;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ /*
+ * This will force exiting early, as tracepoint_printk
+ * is always zero when tracepoint_printk_iter is not allocated
+ */
+ if (!tracepoint_print_iter)
+ tracepoint_printk = 0;
+
+ if (save_tracepoint_printk == tracepoint_printk)
+ goto out;
+
+ if (tracepoint_printk)
+ static_key_enable(&tracepoint_printk_key.key);
+ else
+ static_key_disable(&tracepoint_printk_key.key);
+
+ out:
+ mutex_unlock(&tracepoint_printk_mutex);
+
+ return ret;
+}
+
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
+{
+ if (static_key_false(&tracepoint_printk_key.key))
+ output_printk(fbuffer);
+
+ event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
+ fbuffer->event, fbuffer->entry,
+ fbuffer->flags, fbuffer->pc);
+}
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2128,6 +2216,139 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
ftrace_trace_userstack(buffer, flags, pc);
}
+/*
+ * Commit @event to @buffer. Similar to trace_buffer_unlock_commit_regs()
+ * but does not dump the stack: it only calls __buffer_unlock_commit().
+ */
+void
+trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ __buffer_unlock_commit(buffer, event);
+}
+
+/*
+ * Hand the payload of @event, together with its length, to the write()
+ * callback of @export.
+ */
+static void
+trace_process_export(struct trace_export *export,
+ struct ring_buffer_event *event)
+{
+ struct trace_entry *payload = ring_buffer_event_data(event);
+ unsigned int payload_len = ring_buffer_event_length(event);
+
+ export->write(payload, payload_len);
+}
+
+/* Held by register_ftrace_export() and unregister_ftrace_export() */
+static DEFINE_MUTEX(ftrace_export_lock);
+
+/* Head of the list of registered exports; walked by ftrace_exports() */
+static struct trace_export __rcu *ftrace_exports_list __read_mostly;
+
+/* Tested in trace_function() before it calls ftrace_exports() */
+static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled);
+
+/* Turn the ftrace_exports_enabled static key on */
+static inline void ftrace_exports_enable(void)
+{
+ static_branch_enable(&ftrace_exports_enabled);
+}
+
+/* Turn the ftrace_exports_enabled static key off */
+static inline void ftrace_exports_disable(void)
+{
+ static_branch_disable(&ftrace_exports_enabled);
+}
+
+/*
+ * Pass @event to every export currently on ftrace_exports_list.
+ * Preemption stays disabled for the whole walk of the list.
+ */
+void ftrace_exports(struct ring_buffer_event *event)
+{
+ struct trace_export *cur;
+
+ preempt_disable_notrace();
+
+ for (cur = rcu_dereference_raw_notrace(ftrace_exports_list);
+ cur != NULL;
+ cur = rcu_dereference_raw_notrace(cur->next))
+ trace_process_export(cur, event);
+
+ preempt_enable_notrace();
+}
+
+/*
+ * Insert @export at the head of @list.
+ */
+static inline void
+add_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ rcu_assign_pointer(export->next, *list);
+ /*
+ * We are entering export into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the export->next pointer is valid before another CPU sees
+ * the export pointer included into the list.
+ */
+ rcu_assign_pointer(*list, export);
+}
+
+/*
+ * Unlink @export from @list.
+ *
+ * Returns 0 when @export was found and removed, -1 when it is not on the
+ * list. A NULL @export is never on the list, so it also yields -1 instead
+ * of being mistaken for the list's NULL terminator.
+ */
+static inline int
+rm_trace_export(struct trace_export **list, struct trace_export *export)
+{
+ struct trace_export **link;
+
+ for (link = list; *link != NULL; link = &(*link)->next) {
+ if (*link != export)
+ continue;
+
+ /* Point the predecessor's link past the removed element */
+ rcu_assign_pointer(*link, (*link)->next);
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * Add @export to @list, enabling the ftrace_exports_enabled static key
+ * first when the list is currently empty.
+ */
+static inline void
+add_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ if (*list == NULL)
+ ftrace_exports_enable();
+
+ add_trace_export(list, export);
+}
+
+/*
+ * Remove @export from @list and disable the ftrace_exports_enabled static
+ * key when the list is empty afterwards.
+ *
+ * Returns the result of rm_trace_export().
+ */
+static inline int
+rm_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+ int ret;
+
+ ret = rm_trace_export(list, export);
+ if (*list == NULL)
+ ftrace_exports_disable();
+
+ return ret;
+}
+
+/*
+ * Add @export to ftrace_exports_list under ftrace_export_lock.
+ *
+ * Returns 0 on success, or -1 (with a one-time warning) when @export has
+ * no write() callback.
+ */
+int register_ftrace_export(struct trace_export *export)
+{
+ if (WARN_ON_ONCE(!export->write))
+ return -1;
+
+ mutex_lock(&ftrace_export_lock);
+
+ add_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_export);
+
+/*
+ * Remove @export from ftrace_exports_list under ftrace_export_lock.
+ *
+ * Returns the result of rm_ftrace_export(): 0 when @export was removed,
+ * -1 when it was not found on the list.
+ */
+int unregister_ftrace_export(struct trace_export *export)
+{
+ int ret;
+
+ mutex_lock(&ftrace_export_lock);
+
+ ret = rm_ftrace_export(&ftrace_exports_list, export);
+
+ mutex_unlock(&ftrace_export_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_export);
+
void
trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -2138,16 +2359,19 @@ trace_function(struct trace_array *tr,
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ if (static_branch_unlikely(&ftrace_exports_enabled))
+ ftrace_exports(event);
__buffer_unlock_commit(buffer, event);
+ }
}
#ifdef CONFIG_STACKTRACE
@@ -2215,8 +2439,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
size *= sizeof(unsigned long);
- event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry) + size, flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -2317,8 +2541,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
- event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
if (!event)
goto out_drop_count;
entry = ring_buffer_event_data(event);
@@ -2488,8 +2712,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -2544,8 +2768,8 @@ __trace_array_vprintk(struct ring_buffer *buffer,
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -4054,6 +4278,7 @@ static const char readme_msg[] =
" x86-tsc: TSC cycle counter\n"
#endif
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+ "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
" tracing_cpumask\t- Limit which CPUs to trace\n"
" instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
"\t\t\t Remove sub-buffer with rmdir\n"
@@ -4065,7 +4290,7 @@ static const char readme_msg[] =
"\n available_filter_functions - list of functions that can be filtered on\n"
" set_ftrace_filter\t- echo function name in here to only trace these\n"
"\t\t\t functions\n"
- "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t accepts: func_full_name or glob-matching-pattern\n"
"\t modules: Can select a group via module\n"
"\t Format: :mod:<module-name>\n"
"\t example: echo :mod:ext3 > set_ftrace_filter\n"
@@ -4123,6 +4348,30 @@ static const char readme_msg[] =
"\t\t\t traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
+#ifdef CONFIG_KPROBE_EVENT
+ " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
+ "\t\t\t Write into this file to define/undefine new trace events.\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+ " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
+ "\t\t\t Write into this file to define/undefine new trace events.\n"
+#endif
+#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT)
+ "\t accepts: event-definitions (one definition per line)\n"
+ "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+ "\t -:[<group>/]<event>\n"
+#ifdef CONFIG_KPROBE_EVENT
+ "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+ "\t place: <path>:<offset>\n"
+#endif
+ "\t args: <name>=fetcharg[:type]\n"
+ "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t $stack<index>, $stack, $retval, $comm\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n"
+ "\t b<bit-width>@<bit-offset>/<container-size>\n"
+#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -4945,7 +5194,7 @@ out:
return ret;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -5494,21 +5743,18 @@ static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
- unsigned long addr = (unsigned long)ubuf;
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
- struct page *pages[2];
- void *map_page[2];
- int nr_pages = 1;
+ const char faulted[] = "<faulted>";
ssize_t written;
- int offset;
int size;
int len;
- int ret;
- int i;
+
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
if (tracing_disabled)
return -EINVAL;
@@ -5519,60 +5765,33 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- /*
- * Userspace is injecting traces into the kernel trace buffer.
- * We want to be as non intrusive as possible.
- * To do so, we do not want to allocate any special buffers
- * or take any locks, but instead write the userspace data
- * straight into the ring buffer.
- *
- * First we need to pin the userspace buffer into memory,
- * which, most likely it is, because it just referenced it.
- * But there's no guarantee that it is. By using get_user_pages_fast()
- * and kmap_atomic/kunmap_atomic() we can get access to the
- * pages directly. We then write the data directly into the
- * ring buffer.
- */
BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- /* check if we cross pages */
- if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
- nr_pages = 2;
-
- offset = addr & (PAGE_SIZE - 1);
- addr &= PAGE_MASK;
-
- ret = get_user_pages_fast(addr, nr_pages, 0, pages);
- if (ret < nr_pages) {
- while (--ret >= 0)
- put_page(pages[ret]);
- written = -EFAULT;
- goto out;
- }
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
- for (i = 0; i < nr_pages; i++)
- map_page[i] = kmap_atomic(pages[i]);
+ /* If less than "<faulted>", then make sure we can still add that */
+ if (cnt < FAULTED_SIZE)
+ size += FAULTED_SIZE - cnt;
- local_save_flags(irq_flags);
- size = sizeof(*entry) + cnt + 2; /* possible \n added */
buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
- if (!event) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
+ if (unlikely(!event))
/* Ring buffer disabled, return as if not open for write */
- written = -EBADF;
- goto out_unlock;
- }
+ return -EBADF;
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
- if (nr_pages == 2) {
- len = PAGE_SIZE - offset;
- memcpy(&entry->buf, map_page[0] + offset, len);
- memcpy(&entry->buf[len], map_page[1], cnt - len);
+ len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ if (len) {
+ memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ cnt = FAULTED_SIZE;
+ written = -EFAULT;
} else
- memcpy(&entry->buf, map_page[0] + offset, cnt);
+ written = cnt;
+ len = cnt;
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
@@ -5582,16 +5801,73 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
- written = cnt;
+ if (written > 0)
+ *fpos += written;
- *fpos += written;
+ return written;
+}
+
+/* Limit it for now to 3K (including tag) */
+#define RAW_DATA_MAX_SIZE (1024*3)
+
+/*
+ * Write handler that records @cnt bytes of user data as a TRACE_RAW_DATA
+ * event. The data must be at least sizeof(unsigned int) bytes long, as
+ * it has to begin with a tag id.
+ *
+ * Returns the number of bytes recorded (and advances *@fpos by it),
+ * -EINVAL when tracing is disabled, TRACE_ITER_MARKERS is not set or @cnt
+ * is out of range, -EBADF when no event could be reserved, or -EFAULT
+ * when the user copy failed. In the -EFAULT case an event with id -1 and
+ * the text "<faulted>" is still committed.
+ */
+static ssize_t
+tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct raw_data_entry *entry;
+ const char faulted[] = "<faulted>";
+ unsigned long irq_flags;
+ ssize_t written;
+ int size;
+ int len;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
+ if (tracing_disabled)
+ return -EINVAL;
+
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
+ /* The marker must at least have a tag id */
+ if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ return -EINVAL;
+
+ /* Anything longer than TRACE_BUF_SIZE is truncated, not rejected */
+ if (cnt > TRACE_BUF_SIZE)
+ cnt = TRACE_BUF_SIZE;
+
+ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt;
+ /* A short write must still leave room for the "<faulted>" record */
+ if (cnt < FAULT_SIZE_ID)
+ size += FAULT_SIZE_ID - cnt;
+
+ buffer = tr->trace_buffer.buffer;
+ event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
+ irq_flags, preempt_count());
+ if (!event)
+ /* Ring buffer disabled, return as if not open for write */
+ return -EBADF;
+
+ entry = ring_buffer_event_data(event);
+
+ /*
+ * One copy starting at ->id fills in the tag and the data after it.
+ * NOTE(review): presumably ->buf directly follows ->id in
+ * struct raw_data_entry - confirm against its definition.
+ */
+ len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
+ if (len) {
+ entry->id = -1;
+ memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ written = -EFAULT;
+ } else
+ written = cnt;
+
+ __buffer_unlock_commit(buffer, event);
+
+ if (written > 0)
+ *fpos += written;
- out_unlock:
- for (i = nr_pages - 1; i >= 0; i--) {
- kunmap_atomic(map_page[i]);
- put_page(pages[i]);
- }
- out:
return written;
}
@@ -5868,7 +6144,7 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -5921,6 +6197,13 @@ static const struct file_operations tracing_mark_fops = {
.release = tracing_release_generic_tr,
};
+/* File operations with tracing_mark_raw_write() as the write handler; no read handler is provided */
+static const struct file_operations tracing_mark_raw_fops = {
+ .open = tracing_open_generic_tr,
+ .write = tracing_mark_raw_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
static const struct file_operations trace_clock_fops = {
.open = tracing_clock_open,
.read = seq_read,
@@ -7190,6 +7473,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("trace_marker_raw", 0220, d_tracer,
+ tr, &tracing_mark_raw_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
@@ -7198,7 +7484,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tr->max_latency, &tracing_max_lat_fops);
#endif
@@ -7635,10 +7921,21 @@ __init static int tracer_alloc_buffers(void)
raw_spin_lock_init(&global_trace.start_lock);
+ /*
+ * The prepare callback allocates some memory for the ring buffer. We
+ * don't free the buffer if the CPU goes down. If we were to free
+ * the buffer, then the user would lose any trace that was in the
+ * buffer. The memory will be removed once the "instance" is removed.
+ */
+ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
+ "trace/RB:preapre", trace_rb_cpu_prepare,
+ NULL);
+ if (ret < 0)
+ goto out_free_cpumask;
/* Used for event triggers */
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
- goto out_free_cpumask;
+ goto out_rm_hp_state;
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
@@ -7699,6 +7996,8 @@ out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
+out_rm_hp_state:
+ cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE);
out_free_cpumask:
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
@@ -7714,6 +8013,8 @@ void __init trace_init(void)
kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (WARN_ON(!tracepoint_print_iter))
tracepoint_printk = 0;
+ else
+ static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
trace_event_init();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f783df416726..1ea51ab53edf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,6 +15,7 @@
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/trace_seq.h>
+#include <linux/glob.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -38,6 +39,8 @@ enum trace_type {
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
+ TRACE_HWLAT,
+ TRACE_RAW_DATA,
__TRACE_LAST_TYPE,
};
@@ -156,7 +159,7 @@ struct trace_array_cpu {
unsigned long policy;
unsigned long rt_priority;
unsigned long skipped_entries;
- cycle_t preempt_timestamp;
+ u64 preempt_timestamp;
pid_t pid;
kuid_t uid;
char comm[TASK_COMM_LEN];
@@ -174,7 +177,7 @@ struct trace_buffer {
struct trace_array *tr;
struct ring_buffer *buffer;
struct trace_array_cpu __percpu *data;
- cycle_t time_start;
+ u64 time_start;
int cpu;
};
@@ -213,6 +216,8 @@ struct trace_array {
*/
struct trace_buffer max_buffer;
bool allocated_snapshot;
+#endif
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
#endif
struct trace_pid_list __rcu *filtered_pids;
@@ -326,6 +331,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
+ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \
+ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -571,6 +578,7 @@ void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
+int tracer_tracing_is_on(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
@@ -594,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void __buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event);
+void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
int trace_empty(struct trace_iterator *iter);
@@ -681,7 +689,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
}
#endif /* CONFIG_STACKTRACE */
-extern cycle_t ftrace_now(int cpu);
+extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
@@ -728,7 +736,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
-extern unsigned long long ns2usecs(cycle_t nsec);
+extern unsigned long long ns2usecs(u64 nsec);
extern int
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
@@ -838,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return 0;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
+
+extern unsigned int fgraph_max_depth;
+
+/*
+ * Return true when the function-graph entry @trace should be ignored:
+ * - its depth is 0 and ftrace_graph_addr() does not accept its function,
+ * - its depth is negative, or
+ * - fgraph_max_depth is non-zero and the depth has reached it.
+ */
+static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+{
+ /* trace it when it is-nested-in or is a function enabled. */
+ return !(trace->depth || ftrace_graph_addr(trace->func)) ||
+ (trace->depth < 0) ||
+ (fgraph_max_depth && trace->depth >= fgraph_max_depth);
+}
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
@@ -1252,6 +1271,7 @@ enum regex_type {
MATCH_FRONT_ONLY,
MATCH_MIDDLE_ONLY,
MATCH_END_ONLY,
+ MATCH_GLOB,
};
struct regex {
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 0f109c4130d3..e3b488825ae3 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -21,6 +21,8 @@ static u64 bm_stddev;
static unsigned int bm_avg;
static unsigned int bm_std;
+static bool ok_to_run;
+
/*
* This gets called in a loop recording the time it took to write
* the tracepoint. What it writes is the time statistics of the last
@@ -164,11 +166,21 @@ static int benchmark_event_kthread(void *arg)
* When the benchmark tracepoint is enabled, it calls this
* function and the thread that calls the tracepoint is created.
*/
-void trace_benchmark_reg(void)
+int trace_benchmark_reg(void)
{
+ /* ok_to_run is only set by an early_initcall; refuse before that */
+ if (!ok_to_run) {
+ pr_warning("trace benchmark cannot be started via kernel command line\n");
+ return -EBUSY;
+ }
+
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
- WARN_ON(!bm_event_thread);
+ /* kthread_run() reports failure with an ERR_PTR(), never with NULL */
+ if (IS_ERR(bm_event_thread)) {
+ int err = PTR_ERR(bm_event_thread);
+
+ pr_warning("trace benchmark failed to create kernel thread\n");
+ /*
+ * Do not leave the error pointer behind: this file passes
+ * bm_event_thread to kthread_stop() elsewhere.
+ */
+ bm_event_thread = NULL;
+ return err;
+ }
+
+ return 0;
}
/*
@@ -182,6 +194,7 @@ void trace_benchmark_unreg(void)
return;
kthread_stop(bm_event_thread);
+ bm_event_thread = NULL;
strcpy(bm_str, "START");
bm_total = 0;
@@ -196,3 +209,12 @@ void trace_benchmark_unreg(void)
bm_avg = 0;
bm_stddev = 0;
}
+
+/*
+ * Registered as an early_initcall: sets ok_to_run, which
+ * trace_benchmark_reg() checks before it starts the benchmark thread.
+ */
+static __init int ok_to_run_trace_benchmark(void)
+{
+ ok_to_run = true;
+
+ return 0;
+}
+
+early_initcall(ok_to_run_trace_benchmark);
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 3c1df1df4e29..ebdbfc2f2a64 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -6,7 +6,7 @@
#include <linux/tracepoint.h>
-extern void trace_benchmark_reg(void);
+extern int trace_benchmark_reg(void);
extern void trace_benchmark_unreg(void);
#define BENCHMARK_EVENT_STRLEN 128
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3a2a73716a5b..75489de546b6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->correct = val == expect;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
out:
current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 5c30efcda5e6..eb7396b7e7c3 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry,
FILTER_OTHER
);
+/*
+ * Raw data entry (TRACE_RAW_DATA): an unsigned int id followed by a
+ * variable-length byte array.
+ */
+FTRACE_ENTRY(raw_data, raw_data_entry,
+
+ TRACE_RAW_DATA,
+
+ F_STRUCT(
+ __field( unsigned int, id )
+ __dynamic_array( char, buf )
+ ),
+
+ F_printk("id:%04x %08x",
+ __entry->id, (int)__entry->buf[0]),
+
+ FILTER_OTHER
+);
+
FTRACE_ENTRY(bputs, bputs_entry,
TRACE_BPUTS,
@@ -322,3 +337,30 @@ FTRACE_ENTRY(branch, trace_branch,
FILTER_OTHER
);
+
+/*
+ * Hardware latency entry (TRACE_HWLAT): inner and outer durations, NMI
+ * time and count, a timestamp and a sequence number.
+ */
+FTRACE_ENTRY(hwlat, hwlat_entry,
+
+ TRACE_HWLAT,
+
+ F_STRUCT(
+ __field( u64, duration )
+ __field( u64, outer_duration )
+ __field( u64, nmi_total_ts )
+ __field_struct( struct timespec, timestamp )
+ __field_desc( long, timestamp, tv_sec )
+ __field_desc( long, timestamp, tv_nsec )
+ __field( unsigned int, nmi_count )
+ __field( unsigned int, seqnum )
+ ),
+
+ /* every field is tab separated, including outer -> nmi-ts */
+ F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+ __entry->seqnum,
+ __entry->tv_sec,
+ __entry->tv_nsec,
+ __entry->duration,
+ __entry->outer_duration,
+ __entry->nmi_total_ts,
+ __entry->nmi_count),
+
+ FILTER_OTHER
+);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 03c0a48c3ac4..93116549a284 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
-
-static void output_printk(struct trace_event_buffer *fbuffer)
-{
- struct trace_event_call *event_call;
- struct trace_event *event;
- unsigned long flags;
- struct trace_iterator *iter = tracepoint_print_iter;
-
- if (!iter)
- return;
-
- event_call = fbuffer->trace_file->event_call;
- if (!event_call || !event_call->event.funcs ||
- !event_call->event.funcs->trace)
- return;
-
- event = &fbuffer->trace_file->event_call->event;
-
- spin_lock_irqsave(&tracepoint_iter_lock, flags);
- trace_seq_init(&iter->seq);
- iter->ent = fbuffer->entry;
- event_call->event.funcs->trace(iter, 0, event);
- trace_seq_putc(&iter->seq, 0);
- printk("%s", iter->seq.buffer);
-
- spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
-}
-
-void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
-{
- if (tracepoint_printk)
- output_printk(fbuffer);
-
- event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->flags, fbuffer->pc);
-}
-EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
-
int trace_event_reg(struct trace_event_call *call,
enum trace_reg type, void *data)
{
@@ -742,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
struct trace_event_call *call;
const char *name;
int ret = -EINVAL;
+ int eret = 0;
list_for_each_entry(file, &tr->events, list) {
@@ -765,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
if (event && strcmp(event, name) != 0)
continue;
- ftrace_event_enable_disable(file, set);
+ ret = ftrace_event_enable_disable(file, set);
- ret = 0;
+ /*
+ * Save the first error and return that. Some events
+ * may still have been enabled, but let the user
+ * know that something went wrong.
+ */
+ if (ret && !eret)
+ eret = ret;
+
+ ret = eret;
}
return ret;
@@ -2843,20 +2812,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
+ entry = trace_create_file("enable", 0644, d_events,
+ tr, &ftrace_tr_enable_fops);
+ if (!entry) {
+ pr_warn("Could not create tracefs 'enable' entry\n");
+ return -ENOMEM;
+ }
+
+ /* These are not as crucial, just warn if they are not created */
+
entry = tracefs_create_file("set_event_pid", 0644, parent,
tr, &ftrace_set_event_pid_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'set_event_pid' entry\n");
/* ring buffer internal formats */
- trace_create_file("header_page", 0444, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
-
- trace_create_file("header_event", 0444, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
+ entry = trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'header_page' entry\n");
- trace_create_file("enable", 0644, d_events,
- tr, &ftrace_tr_enable_fops);
+ entry = trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'header_event' entry\n");
tr->event_dir = d_events;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9daa9b3bc6d9..59a411ff60c7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -108,12 +108,12 @@ static char *err_text[] = {
};
struct opstack_op {
- int op;
+ enum filter_op_ids op;
struct list_head list;
};
struct postfix_elt {
- int op;
+ enum filter_op_ids op;
char *operand;
struct list_head list;
};
@@ -145,34 +145,50 @@ struct pred_stack {
/* If not of not match is equal to not of not, then it is a match */
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_##type(struct filter_pred *pred, void *event) \
+static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = 0; \
- \
- switch (pred->op) { \
- case OP_LT: \
- match = (*addr < val); \
- break; \
- case OP_LE: \
- match = (*addr <= val); \
- break; \
- case OP_GT: \
- match = (*addr > val); \
- break; \
- case OP_GE: \
- match = (*addr >= val); \
- break; \
- case OP_BAND: \
- match = (*addr & val); \
- break; \
- default: \
- break; \
- } \
- \
+ int match = (*addr < val); \
return !!match == !pred->not; \
-}
+} \
+static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr <= val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr > val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = (*addr >= val); \
+ return !!match == !pred->not; \
+} \
+static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = !!(*addr & val); \
+ return match == !pred->not; \
+} \
+static const filter_pred_fn_t pred_funcs_##type[] = { \
+ filter_pred_LT_##type, \
+ filter_pred_LE_##type, \
+ filter_pred_GT_##type, \
+ filter_pred_GE_##type, \
+ filter_pred_BAND_##type, \
+};
+
+#define PRED_FUNC_START OP_LT
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
@@ -344,6 +360,12 @@ static int regex_match_end(char *str, struct regex *r, int len)
return 0;
}
+/*
+ * Return 1 when @str matches the glob pattern stored in @r, 0 otherwise.
+ * @len is not used.
+ */
+static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
+{
+ return glob_match(r->pattern, str) ? 1 : 0;
+}
/**
* filter_parse_regex - parse a basic regex
* @buff: the raw regex
@@ -380,14 +402,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
if (!i) {
*search = buff + 1;
type = MATCH_END_ONLY;
- } else {
+ } else if (i == len - 1) {
if (type == MATCH_END_ONLY)
type = MATCH_MIDDLE_ONLY;
else
type = MATCH_FRONT_ONLY;
buff[i] = 0;
break;
+ } else { /* pattern continues, use full glob */
+ type = MATCH_GLOB;
+ break;
}
+ } else if (strchr("[?\\", buff[i])) {
+ type = MATCH_GLOB;
+ break;
}
}
@@ -420,6 +448,9 @@ static void filter_build_regex(struct filter_pred *pred)
case MATCH_END_ONLY:
r->match = regex_match_end;
break;
+ case MATCH_GLOB:
+ r->match = regex_match_glob;
+ break;
}
pred->not ^= not;
@@ -946,7 +977,7 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static bool is_legal_op(struct ftrace_event_field *field, int op)
+static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
{
if (is_string_field(field) &&
(op != OP_EQ && op != OP_NE && op != OP_GLOB))
@@ -957,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op)
return true;
}
-static filter_pred_fn_t select_comparison_fn(int op, int field_size,
- int field_is_signed)
+static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
+ int field_size, int field_is_signed)
{
filter_pred_fn_t fn = NULL;
@@ -967,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_64;
else if (field_is_signed)
- fn = filter_pred_s64;
+ fn = pred_funcs_s64[op - PRED_FUNC_START];
else
- fn = filter_pred_u64;
+ fn = pred_funcs_u64[op - PRED_FUNC_START];
break;
case 4:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_32;
else if (field_is_signed)
- fn = filter_pred_s32;
+ fn = pred_funcs_s32[op - PRED_FUNC_START];
else
- fn = filter_pred_u32;
+ fn = pred_funcs_u32[op - PRED_FUNC_START];
break;
case 2:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_16;
else if (field_is_signed)
- fn = filter_pred_s16;
+ fn = pred_funcs_s16[op - PRED_FUNC_START];
else
- fn = filter_pred_u16;
+ fn = pred_funcs_u16[op - PRED_FUNC_START];
break;
case 1:
if (op == OP_EQ || op == OP_NE)
fn = filter_pred_8;
else if (field_is_signed)
- fn = filter_pred_s8;
+ fn = pred_funcs_s8[op - PRED_FUNC_START];
else
- fn = filter_pred_u8;
+ fn = pred_funcs_u8[op - PRED_FUNC_START];
break;
}
@@ -1166,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c)
return 0;
}
-static int filter_opstack_push(struct filter_parse_state *ps, int op)
+static int filter_opstack_push(struct filter_parse_state *ps,
+ enum filter_op_ids op)
{
struct opstack_op *opstack_op;
@@ -1200,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps)
static int filter_opstack_pop(struct filter_parse_state *ps)
{
struct opstack_op *opstack_op;
- int op;
+ enum filter_op_ids op;
if (filter_opstack_empty(ps))
return OP_NONE;
@@ -1245,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
return 0;
}
-static int postfix_append_op(struct filter_parse_state *ps, int op)
+static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op)
{
struct postfix_elt *elt;
@@ -1275,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps)
static int filter_parse(struct filter_parse_state *ps)
{
+ enum filter_op_ids op, top_op;
int in_string = 0;
- int op, top_op;
char ch;
while ((ch = infix_next(ps))) {
@@ -1367,7 +1399,8 @@ parse_operand:
static struct filter_pred *create_pred(struct filter_parse_state *ps,
struct trace_event_call *call,
- int op, char *operand1, char *operand2)
+ enum filter_op_ids op,
+ char *operand1, char *operand2)
{
struct ftrace_event_field *field;
static struct filter_pred pred;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a975571cde24..6721a1e89f39 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
static struct event_command trigger_traceoff_cmd = {
.name = "traceoff",
.trigger_type = ETT_TRACE_ONOFF,
+ .flags = EVENT_CMD_FL_POST_TRIGGER,
.func = event_trigger_callback,
.reg = register_trigger,
.unreg = unregister_trigger,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 7363ccf79512..d56123cdcc89 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,7 +65,7 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
-static unsigned int max_depth;
+unsigned int fgraph_max_depth;
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
/* Add a function return address to the trace stack on thread info.*/
int
ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, unsigned long *retp)
{
unsigned long long calltime;
int index;
@@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
- current->ret_stack[index].subtime = 0;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+ current->ret_stack[index].retp = retp;
+#endif
*depth = current->curr_ret_stack;
return 0;
@@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
return;
}
-#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
/*
* The arch may choose to record the frame pointer used
* and check it here to make sure that it is what we expect it
@@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ * to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'. If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack. It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int index = task->curr_ret_stack;
+ int i;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ if (index < -1)
+ index += FTRACE_NOTRACE_DEPTH;
+
+ if (index < 0)
+ return ret;
+
+ for (i = 0; i <= index; i++)
+ if (task->ret_stack[i].retp == retp)
+ return task->ret_stack[i].ret;
+
+ return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+ unsigned long ret, unsigned long *retp)
+{
+ int task_idx;
+
+ if (ret != (unsigned long)return_to_handler)
+ return ret;
+
+ task_idx = task->curr_ret_stack;
+
+ if (!task->ret_stack || task_idx < *idx)
+ return ret;
+
+ task_idx -= *idx;
+ (*idx)++;
+
+ return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
@@ -296,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
@@ -322,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- /* trace it when it is-nested-in or is a function enabled. */
- if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
- (max_depth && trace->depth >= max_depth))
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+
+ if (ftrace_graph_ignore_irqs())
return 0;
/*
@@ -407,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->ret = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -780,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
@@ -788,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data->depth = call->depth - 1;
/* No need to keep this function around for this depth */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = 0;
}
@@ -818,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
+ /* If a graph tracer ignored set_graph_notrace */
+ if (call->depth < -1)
+ call->depth += FTRACE_NOTRACE_DEPTH;
+
cpu_data = per_cpu_ptr(data->cpu_data, cpu);
cpu_data->depth = call->depth;
/* Save this function pointer to see if the exit matches */
- if (call->depth < FTRACE_RETFUNC_DEPTH)
+ if (call->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(call->depth < 0))
cpu_data->enter_funcs[call->depth] = call->func;
}
@@ -1052,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
*/
cpu_data->depth = trace->depth - 1;
- if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (trace->depth < FTRACE_RETFUNC_DEPTH &&
+ !WARN_ON_ONCE(trace->depth < 0)) {
if (cpu_data->enter_funcs[trace->depth] != trace->func)
func_match = 0;
cpu_data->enter_funcs[trace->depth] = 0;
@@ -1120,6 +1193,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
+ case TRACE_BPUTS:
+ ret = trace_print_bputs_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
case TRACE_BPRINT:
ret = trace_print_bprintk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
@@ -1422,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- max_depth = val;
+ fgraph_max_depth = val;
*ppos += cnt;
@@ -1436,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
int n;
- n = sprintf(buf, "%d\n", max_depth);
+ n = sprintf(buf, "%d\n", fgraph_max_depth);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644
index 000000000000..af344a1bf0d0
--- /dev/null
+++ b/kernel/trace/trace_hwlat.c
@@ -0,0 +1,635 @@
+/*
+ * trace_hwlat.c - A simple Hardware Latency detector.
+ *
+ * Use this tracer to detect large system latencies induced by the behavior of
+ * certain underlying system hardware or firmware, independent of Linux itself.
+ * The code was developed originally to detect the presence of SMIs on Intel
+ * and AMD systems, although there is no dependency upon x86 herein.
+ *
+ * The classical example usage of this tracer is in detecting the presence of
+ * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
+ * somewhat special form of hardware interrupt spawned from earlier CPU debug
+ * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
+ * LPC (or other device) to generate a special interrupt under certain
+ * circumstances, for example, upon expiration of a special SMI timer device,
+ * due to certain external thermal readings, on certain I/O address accesses,
+ * and other situations. An SMI hits a special CPU pin, triggers a special
+ * SMI mode (complete with special memory map), and the OS is unaware.
+ *
+ * Although certain hardware-induced latencies are necessary (for example,
+ * a modern system often requires an SMI handler for correct thermal control
+ * and remote management) they can wreak havoc upon any OS-level performance
+ * guarantees toward low-latency, especially when the OS is not even made
+ * aware of the presence of these interrupts. For this reason, we need a
+ * somewhat brute force mechanism to detect these interrupts. In this case,
+ * we do it by hogging all of the CPU(s) for configurable timer intervals,
+ * sampling the built-in CPU timer, looking for discontiguous readings.
+ *
+ * WARNING: This implementation necessarily introduces latencies. Therefore,
+ * you should NEVER use this tracer while running in a production
+ * environment requiring any kind of low-latency performance
+ * guarantee(s).
+ *
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *
+ * Includes useful feedback from Clark Williams <clark@redhat.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include "trace.h"
+
+static struct trace_array *hwlat_trace;
+
+#define U64STR_SIZE 22 /* 20 digits max */
+
+#define BANNER "hwlat_detector: "
+#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */
+#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
+#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
+
+/* sampling thread*/
+static struct task_struct *hwlat_kthread;
+
+static struct dentry *hwlat_sample_width; /* sample width us */
+static struct dentry *hwlat_sample_window; /* sample window us */
+
+/* Save the previous tracing_thresh value */
+static unsigned long save_tracing_thresh;
+
+/* NMI timestamp counters */
+static u64 nmi_ts_start;
+static u64 nmi_total_ts;
+static int nmi_count;
+static int nmi_cpu;
+
+/* Tells NMIs to call back to the hwlat tracer to record timestamps */
+bool trace_hwlat_callback_enabled;
+
+/* If the user changed threshold, remember it */
+static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
+
+/* Individual latency samples are stored here when detected. */
+struct hwlat_sample {
+ u64 seqnum; /* unique sequence */
+ u64 duration; /* delta */
+ u64 outer_duration; /* delta (outer loop) */
+ u64 nmi_total_ts; /* Total time spent in NMIs */
+ struct timespec timestamp; /* wall time */
+ int nmi_count; /* # NMIs during this sample */
+};
+
+/* keep the global state somewhere. */
+static struct hwlat_data {
+
+ struct mutex lock; /* protect changes */
+
+ u64 count; /* total since reset */
+
+ u64 sample_window; /* total sampling window (on+off) */
+ u64 sample_width; /* active sampling portion of window */
+
+} hwlat_data = {
+ .sample_window = DEFAULT_SAMPLE_WINDOW,
+ .sample_width = DEFAULT_SAMPLE_WIDTH,
+};
+
+static void trace_hwlat_sample(struct hwlat_sample *sample)
+{
+ struct trace_array *tr = hwlat_trace;
+ struct trace_event_call *call = &event_hwlat;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct hwlat_entry *entry;
+ unsigned long flags;
+ int pc;
+
+ pc = preempt_count();
+ local_save_flags(flags);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->seqnum = sample->seqnum;
+ entry->duration = sample->duration;
+ entry->outer_duration = sample->outer_duration;
+ entry->timestamp = sample->timestamp;
+ entry->nmi_total_ts = sample->nmi_total_ts;
+ entry->nmi_count = sample->nmi_count;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+/* Macros to encapsulate the time capturing infrastructure */
+#define time_type u64
+#define time_get() trace_clock_local()
+#define time_to_us(x) div_u64(x, 1000)
+#define time_sub(a, b) ((a) - (b))
+#define init_time(a, b) (a = b)
+#define time_u64(a) a
+
+void trace_hwlat_callback(bool enter)
+{
+	if (smp_processor_id() != nmi_cpu)
+		return;
+
+	/*
+	 * trace_clock_local() calls sched_clock(), whose generic version is
+	 * not NMI safe. Accumulate (+=) so every NMI in the sample is counted.
+	 */
+	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+		if (enter)
+			nmi_ts_start = time_get();
+		else
+			nmi_total_ts += time_get() - nmi_ts_start;
+	}
+
+	if (enter)
+		nmi_count++;
+}
+
+/**
+ * get_sample - sample the CPU TSC and look for likely hardware latencies
+ *
+ * Used to repeatedly capture the CPU TSC (or similar), looking for potential
+ * hardware-induced latency. Called with interrupts disabled; the caller in
+ * this file (kthread_fn) does not hold hwlat_data.lock across the call.
+ */
+static int get_sample(void)
+{
+ struct trace_array *tr = hwlat_trace;
+ time_type start, t1, t2, last_t2;
+ s64 diff, total, last_total = 0;
+ u64 sample = 0;
+ u64 thresh = tracing_thresh;
+ u64 outer_sample = 0;
+ int ret = -1;
+
+ do_div(thresh, NSEC_PER_USEC); /* converts thresh in place: nsecs -> usecs */
+
+ nmi_cpu = smp_processor_id();
+ nmi_total_ts = 0;
+ nmi_count = 0;
+ /* Make sure NMIs see this first */
+ barrier();
+
+ trace_hwlat_callback_enabled = true;
+
+ init_time(last_t2, 0);
+ start = time_get(); /* start timestamp */
+
+ do {
+
+ t1 = time_get(); /* we'll look for a discontinuity */
+ t2 = time_get();
+
+ if (time_u64(last_t2)) {
+ /* Check the delta from outer loop (t2 to next t1) */
+ diff = time_to_us(time_sub(t1, last_t2));
+ /* This shouldn't happen */
+ if (diff < 0) {
+ pr_err(BANNER "time running backwards\n");
+ goto out;
+ }
+ if (diff > outer_sample)
+ outer_sample = diff;
+ }
+ last_t2 = t2;
+
+ total = time_to_us(time_sub(t2, start)); /* sample width */
+
+ /* Check for possible overflows */
+ if (total < last_total) {
+ pr_err("Time total overflowed\n");
+ break;
+ }
+ last_total = total;
+
+ /* This checks the inner loop (t1 to t2) */
+ diff = time_to_us(time_sub(t2, t1)); /* current diff */
+
+ /* This shouldn't happen */
+ if (diff < 0) {
+ pr_err(BANNER "time running backwards\n");
+ goto out;
+ }
+
+ if (diff > sample)
+ sample = diff; /* only want highest value */
+
+ } while (total <= hwlat_data.sample_width);
+
+ barrier(); /* finish the above in the view for NMIs */
+ trace_hwlat_callback_enabled = false;
+ barrier(); /* Make sure nmi_total_ts is no longer updated */
+
+ ret = 0;
+
+ /* If we exceed the threshold value, we have found a hardware latency */
+ if (sample > thresh || outer_sample > thresh) {
+ struct hwlat_sample s;
+
+ ret = 1;
+
+ /* We read in microseconds */
+ if (nmi_total_ts)
+ do_div(nmi_total_ts, NSEC_PER_USEC);
+
+ hwlat_data.count++;
+ s.seqnum = hwlat_data.count;
+ s.duration = sample;
+ s.outer_duration = outer_sample;
+ s.timestamp = CURRENT_TIME;
+ s.nmi_total_ts = nmi_total_ts;
+ s.nmi_count = nmi_count;
+ trace_hwlat_sample(&s);
+
+ /* Track the largest inner-loop latency recorded so far */
+ if (sample > tr->max_latency)
+ tr->max_latency = sample;
+ }
+
+out:
+ return ret;
+}
+
+static struct cpumask save_cpumask;
+static bool disable_migrate;
+
+static void move_to_next_cpu(bool initmask)
+{
+ static struct cpumask *current_mask;
+ int next_cpu;
+
+ if (disable_migrate)
+ return;
+
+ /* Just pick the first CPU on first iteration */
+ if (initmask) {
+ current_mask = &save_cpumask;
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ put_online_cpus();
+ next_cpu = cpumask_first(current_mask);
+ goto set_affinity;
+ }
+
+ /*
+ * If for some reason the user modifies the CPU affinity
+ * of this thread, then stop migrating for the duration
+ * of the current test.
+ */
+ if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ goto disable;
+
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ next_cpu = cpumask_next(smp_processor_id(), current_mask);
+ put_online_cpus();
+
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(current_mask);
+
+ set_affinity:
+ if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
+ goto disable;
+
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+
+ sched_setaffinity(0, current_mask);
+ return;
+
+ disable:
+ disable_migrate = true;
+}
+
+/*
+ * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
+ *
+ * Used to periodically sample the CPU TSC via a call to get_sample. We
+ * disable interrupts, which does (intentionally) introduce latency since we
+ * need to ensure nothing else might be running (and thus preempting).
+ * Obviously this should never be used in production environments.
+ *
+ * Before each sample the thread calls move_to_next_cpu(), which pins it to
+ * the next CPU in (cpu_online_mask & tracing_buffer_mask), wrapping around
+ * at the end. Migration stops for the rest of the test if the thread's
+ * affinity is changed from outside (see disable_migrate).
+ */
+static int kthread_fn(void *data)
+{
+ u64 interval;
+ bool initmask = true;
+
+ while (!kthread_should_stop()) {
+
+ move_to_next_cpu(initmask);
+ initmask = false;
+
+ local_irq_disable();
+ get_sample();
+ local_irq_enable();
+
+ mutex_lock(&hwlat_data.lock);
+ interval = hwlat_data.sample_window - hwlat_data.sample_width;
+ mutex_unlock(&hwlat_data.lock);
+
+ do_div(interval, USEC_PER_MSEC); /* modifies interval value */
+
+ /* Always sleep for at least 1ms */
+ if (interval < 1)
+ interval = 1;
+
+ if (msleep_interruptible(interval))
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * start_kthread - Kick off the hardware latency sampling/detector kthread
+ *
+ * This starts the kernel thread that will sit and sample the CPU timestamp
+ * counter (TSC or similar) and look for potential hardware latencies.
+ */
+static int start_kthread(struct trace_array *tr)
+{
+ struct task_struct *kthread;
+
+ kthread = kthread_create(kthread_fn, NULL, "hwlatd");
+ if (IS_ERR(kthread)) {
+ pr_err(BANNER "could not start sampling thread\n");
+ return -ENOMEM;
+ }
+ hwlat_kthread = kthread;
+ wake_up_process(kthread);
+
+ return 0;
+}
+
+/**
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_kthread(void)
+{
+ if (!hwlat_kthread)
+ return;
+ kthread_stop(hwlat_kthread);
+ hwlat_kthread = NULL;
+}
+
+/*
+ * hwlat_read - Wrapper read function for reading both window and width
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function provides a generic read implementation for the global state
+ * "hwlat_data" structure filesystem entries.
+ */
+static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[U64STR_SIZE];
+ u64 *entry = filp->private_data;
+ u64 val;
+ int len;
+
+ if (!entry)
+ return -EFAULT;
+
+ if (cnt > sizeof(buf))
+ cnt = sizeof(buf);
+
+ val = *entry;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/**
+ * hwlat_width_write - Write function for "width" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "width" interface
+ * to the hardware latency detector. It can be used to configure
+ * for how many us of the total window us we will actively sample for any
+ * hardware-induced latency periods. Obviously, it is not possible to
+ * sample constantly and have the system respond to a sample reader, or,
+ * worse, without having the system appear to have gone out to lunch. It
+ * is enforced that width is less than the total window size.
+ */
+static ssize_t
+hwlat_width_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ u64 val;
+ int err;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ mutex_lock(&hwlat_data.lock);
+ if (val < hwlat_data.sample_window)
+ hwlat_data.sample_width = val;
+ else
+ err = -EINVAL;
+ mutex_unlock(&hwlat_data.lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+/**
+ * hwlat_window_write - Write function for "window" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "window" interface
+ * to the hardware latency detector. The window is the total time
+ * in us that will be considered one sample period. Conceptually, windows
+ * occur back-to-back and contain a sample width period during which
+ * actual sampling occurs. Can be used to write a new total window size. It
+ * is enforced that any value written must be greater than the sample width
+ * size, or an error results.
+ */
+static ssize_t
+hwlat_window_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ u64 val;
+ int err;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+ if (err)
+ return err;
+
+ mutex_lock(&hwlat_data.lock);
+ if (hwlat_data.sample_width < val)
+ hwlat_data.sample_window = val;
+ else
+ err = -EINVAL;
+ mutex_unlock(&hwlat_data.lock);
+
+ if (err)
+ return err;
+
+ return cnt;
+}
+
+static const struct file_operations width_fops = {
+ .open = tracing_open_generic,
+ .read = hwlat_read,
+ .write = hwlat_width_write,
+};
+
+static const struct file_operations window_fops = {
+ .open = tracing_open_generic,
+ .read = hwlat_read,
+ .write = hwlat_window_write,
+};
+
+/**
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "hwlat_detector".
+ * It creates the hwlat_detector directory in the tracing directory,
+ * and within that directory are the width and window files to
+ * change and view those values.
+ */
+static int init_tracefs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *top_dir;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return -ENOMEM;
+
+ top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+ if (!top_dir)
+ return -ENOMEM;
+
+ hwlat_sample_window = tracefs_create_file("window", 0640,
+ top_dir,
+ &hwlat_data.sample_window,
+ &window_fops);
+ if (!hwlat_sample_window)
+ goto err;
+
+ hwlat_sample_width = tracefs_create_file("width", 0644,
+ top_dir,
+ &hwlat_data.sample_width,
+ &width_fops);
+ if (!hwlat_sample_width)
+ goto err;
+
+ return 0;
+
+ err:
+ tracefs_remove_recursive(top_dir);
+ return -ENOMEM;
+}
+
+static void hwlat_tracer_start(struct trace_array *tr)
+{
+ int err;
+
+ err = start_kthread(tr);
+ if (err)
+ pr_err(BANNER "Cannot start hwlat kthread\n");
+}
+
+static void hwlat_tracer_stop(struct trace_array *tr)
+{
+ stop_kthread();
+}
+
+static bool hwlat_busy;
+
+static int hwlat_tracer_init(struct trace_array *tr)
+{
+ /* Only allow one instance to enable this */
+ if (hwlat_busy)
+ return -EBUSY;
+
+ hwlat_trace = tr;
+
+ disable_migrate = false;
+ hwlat_data.count = 0;
+ tr->max_latency = 0;
+ save_tracing_thresh = tracing_thresh;
+
+ /* tracing_thresh is in nsecs, we speak in usecs */
+ if (!tracing_thresh)
+ tracing_thresh = last_tracing_thresh;
+
+ if (tracer_tracing_is_on(tr))
+ hwlat_tracer_start(tr);
+
+ hwlat_busy = true;
+
+ return 0;
+}
+
+static void hwlat_tracer_reset(struct trace_array *tr)
+{
+ stop_kthread();
+
+ /* the tracing threshold is static between runs */
+ last_tracing_thresh = tracing_thresh;
+
+ tracing_thresh = save_tracing_thresh;
+ hwlat_busy = false;
+}
+
+static struct tracer hwlat_tracer __read_mostly =
+{
+ .name = "hwlat",
+ .init = hwlat_tracer_init,
+ .reset = hwlat_tracer_reset,
+ .start = hwlat_tracer_start,
+ .stop = hwlat_tracer_stop,
+ .allow_instances = true,
+};
+
+__init static int init_hwlat_tracer(void)
+{
+ int ret;
+
+ mutex_init(&hwlat_data.lock);
+
+ ret = register_tracer(&hwlat_tracer);
+ if (ret)
+ return ret;
+
+ init_tracefs();
+
+ return 0;
+}
+late_initcall(init_hwlat_tracer);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 03cdff84d026..7758bc0617cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
int ret;
int pc;
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
if (!func_prolog_dec(tr, &data, &flags))
return 0;
@@ -286,7 +298,7 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static bool report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, u64 delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
@@ -304,7 +316,7 @@ check_critical_timing(struct trace_array *tr,
unsigned long parent_ip,
int cpu)
{
- cycle_t T0, T1, delta;
+ u64 T0, T1, delta;
unsigned long flags;
int pc;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9aedb0b06683..7ad9e53ad174 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
return !!strchr(trace_kprobe_symbol(tk), ':');
}
+static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
+{
+ unsigned long nhit = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ nhit += *per_cpu_ptr(tk->nhit, cpu);
+
+ return nhit;
+}
+
static int register_kprobe_event(struct trace_kprobe *tk);
static int unregister_kprobe_event(struct trace_kprobe *tk);
@@ -253,6 +264,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = {
ASSIGN_FETCH_TYPE(s16, u16, 1),
ASSIGN_FETCH_TYPE(s32, u32, 1),
ASSIGN_FETCH_TYPE(s64, u64, 1),
+ ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
ASSIGN_FETCH_TYPE_END
};
@@ -878,14 +893,10 @@ static const struct file_operations kprobe_events_ops = {
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
- unsigned long nhit = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- nhit += *per_cpu_ptr(tk->nhit, cpu);
seq_printf(m, " %-44s %15lu %15lu\n",
- trace_event_name(&tk->tp.call), nhit,
+ trace_event_name(&tk->tp.call),
+ trace_kprobe_nhit(tk),
tk->rp.kp.nmissed);
return 0;
@@ -1350,18 +1361,18 @@ fs_initcall(init_kprobe_trace);
#ifdef CONFIG_FTRACE_STARTUP_TEST
-
/*
* The "__used" keeps gcc from removing the function symbol
- * from the kallsyms table.
+ * from the kallsyms table. 'noinline' makes sure that there
+ * isn't an inlined version used by the test method below
*/
-static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
- int a4, int a5, int a6)
+static __used __init noinline int
+kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
{
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct trace_event_file *
+static __init struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
struct trace_event_file *file;
@@ -1439,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void)
ret = target(1, 2, 3, 4, 5, 6);
+ /*
+ * Not expecting an error here, the check is only to prevent the
+ * optimizer from removing the call to target() as otherwise there
+ * are no side-effects and the call is never performed.
+ */
+ if (ret != 21)
+ warn++;
+
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
if (WARN_ON_ONCE(tk == NULL)) {
pr_warn("error on getting test probe.\n");
warn++;
} else {
+ if (trace_kprobe_nhit(tk) != 1) {
+ pr_warn("incorrect number of testprobe hits\n");
+ warn++;
+ }
+
file = find_trace_probe_file(tk, top_trace_array());
if (WARN_ON_ONCE(file == NULL)) {
pr_warn("error on getting probe file.\n");
@@ -1458,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void)
pr_warn("error on getting 2nd test probe.\n");
warn++;
} else {
+ if (trace_kprobe_nhit(tk) != 1) {
+ pr_warn("incorrect number of testprobe2 hits\n");
+ warn++;
+ }
+
file = find_trace_probe_file(tk, top_trace_array());
if (WARN_ON_ONCE(file == NULL)) {
pr_warn("error on getting probe file.\n");
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0bb9cf2d53e6..5d33a7352919 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
+/* TRACE_HWLAT */
+static enum print_line_t
+trace_hwlat_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct hwlat_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
+ field->seqnum,
+ field->duration,
+ field->outer_duration,
+ field->timestamp.tv_sec,
+ field->timestamp.tv_nsec);
+
+ if (field->nmi_count) {
+ /*
+ * The generic sched_clock() is not NMI safe, thus
+ * we only record the count and not the time.
+ */
+ if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK))
+ trace_seq_printf(s, " nmi-total:%llu",
+ field->nmi_total_ts);
+ trace_seq_printf(s, " nmi-count:%u",
+ field->nmi_count);
+ }
+
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_hwlat_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct hwlat_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+ field->duration,
+ field->outer_duration,
+ field->timestamp.tv_sec,
+ field->timestamp.tv_nsec,
+ field->seqnum);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_hwlat_funcs = {
+ .trace = trace_hwlat_print,
+ .raw = trace_hwlat_raw,
+};
+
+static struct trace_event trace_hwlat_event = {
+ .type = TRACE_HWLAT,
+ .funcs = &trace_hwlat_funcs,
+};
+
/* TRACE_BPUTS */
static enum print_line_t
trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1223,6 +1288,35 @@ static struct trace_event trace_print_event = {
.funcs = &trace_print_funcs,
};
+/*
+ * Print a TRACE_RAW_DATA entry as "# <id> buf:" followed by every payload
+ * byte in two-digit hex.  The payload length is whatever remains of the
+ * entry after the fixed header that precedes ->buf.
+ */
+static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags,
+					 struct trace_event *event)
+{
+	struct raw_data_entry *field;
+	int len;
+	int i;
+
+	trace_assign_type(field, iter->ent);
+
+	/*
+	 * Compute the length as a signed value: offsetof() is a size_t, so
+	 * comparing "i" against "ent_size - offsetof()" directly would be an
+	 * unsigned comparison, and an entry shorter than the header would
+	 * wrap to a huge bound instead of printing nothing.
+	 */
+	len = (int)iter->ent_size - (int)offsetof(struct raw_data_entry, buf);
+
+	trace_seq_printf(&iter->seq, "# %x buf:", field->id);
+
+	for (i = 0; i < len; i++)
+		trace_seq_printf(&iter->seq, " %02x",
+				 (unsigned char)field->buf[i]);
+
+	trace_seq_putc(&iter->seq, '\n');
+
+	return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions trace_raw_data_funcs = {
+ .trace = trace_raw_data, /* the same hex dump serves both callbacks */
+ .raw = trace_raw_data,
+};
+
+static struct trace_event trace_raw_data_event = {
+ .type = TRACE_RAW_DATA, /* entry type handled by the callbacks below */
+ .funcs = &trace_raw_data_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
@@ -1233,6 +1327,8 @@ static struct trace_event *events[] __initdata = {
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
+ &trace_hwlat_event,
+ &trace_raw_data_event,
NULL
};
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 74e80a582c28..8c0553d9afd3 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -36,24 +36,28 @@ const char *reserved_field_names[] = {
};
/* Printing in basic type function template */
-#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
-int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \
+int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
return !trace_seq_has_overflowed(s); \
} \
-const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
-NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
-
-DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
+const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname));
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
/* Print type function for string type */
int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 45400ca5ded1..0c0ae54d44c6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
@@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \
DEFINE_FETCH_##method(u64)
/* Default (unsigned long) fetch type */
-#define __DEFAULT_FETCH_TYPE(t) u##t
+#define __DEFAULT_FETCH_TYPE(t) x##t
#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
@@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+/*
+ * Use this instead of ASSIGN_FETCH_TYPE() when ptype is an alias of atype:
+ * the entry is still named after ptype, but the stringified atype is passed
+ * as the last argument so that atype is what gets shown in the format.
+ */
+#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype)
+
#define ASSIGN_FETCH_TYPE_END {}
#define FETCH_TYPE_STRING 0
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9d4399b553a3..ddec53b67646 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
unsigned long flags;
int pc, ret = 0;
+ if (ftrace_graph_ignore_func(trace))
+ return 0;
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
if (!func_prolog_preempt_disable(tr, &data, &pc))
return 0;
@@ -346,7 +358,7 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static bool report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, u64 delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
@@ -428,7 +440,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
- cycle_t T0, T1, delta;
+ u64 T0, T1, delta;
unsigned long flags;
long disabled;
int cpu;
@@ -790,6 +802,7 @@ static struct tracer wakeup_dl_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b2b6efc083a4..5e10395da88e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call)
if (!sys_perf_refcount_enter)
ret = register_trace_sys_enter(perf_syscall_enter, NULL);
if (ret) {
- pr_info("event trace: Could not activate"
- "syscall entry trace point");
+ pr_info("event trace: Could not activate syscall entry trace point");
} else {
set_bit(num, enabled_perf_enter_syscalls);
sys_perf_refcount_enter++;
@@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call)
if (!sys_perf_refcount_exit)
ret = register_trace_sys_exit(perf_syscall_exit, NULL);
if (ret) {
- pr_info("event trace: Could not activate"
- "syscall exit trace point");
+ pr_info("event trace: Could not activate syscall exit trace point");
} else {
set_bit(num, enabled_perf_exit_syscalls);
sys_perf_refcount_exit++;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c53485441c88..0913693caf6e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = {
ASSIGN_FETCH_TYPE(s16, u16, 1),
ASSIGN_FETCH_TYPE(s32, u32, 1),
ASSIGN_FETCH_TYPE(s64, u64, 1),
+ ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
ASSIGN_FETCH_TYPE_END
};
@@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- pr_info("probe point must be have a filename.\n");
- return -EINVAL;
- }
arg = strchr(argv[1], ':');
if (!arg) {
ret = -EINVAL;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d0639d917899..1f9a31f934a4 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp,
struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
+ int ret;
- if (tp->regfunc && !static_key_enabled(&tp->key))
- tp->regfunc();
+ if (tp->regfunc && !static_key_enabled(&tp->key)) {
+ ret = tp->regfunc();
+ if (ret < 0)
+ return ret;
+ }
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
@@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
static int sys_tracepoint_refcount;
-void syscall_regfunc(void)
+int syscall_regfunc(void)
{
struct task_struct *p, *t;
@@ -541,6 +545,8 @@ void syscall_regfunc(void)
read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
+
+ return 0;
}
void syscall_unregfunc(void)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f8e26ab963ed..5c21f0535056 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,7 +31,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
- cputime_t utime, stime, utimescaled, stimescaled;
+ u64 utime, stime, utimescaled, stimescaled;
u64 delta;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -67,12 +67,12 @@ void bacct_add_tsk(struct user_namespace *user_ns,
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
- stats->ac_utime = cputime_to_usecs(utime);
- stats->ac_stime = cputime_to_usecs(stime);
+ stats->ac_utime = div_u64(utime, NSEC_PER_USEC);
+ stats->ac_stime = div_u64(stime, NSEC_PER_USEC);
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
- stats->ac_utimescaled = cputime_to_usecs(utimescaled);
- stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+ stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC);
+ stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC);
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
@@ -123,18 +123,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
#undef MB
static void __acct_update_integrals(struct task_struct *tsk,
- cputime_t utime, cputime_t stime)
+ u64 utime, u64 stime)
{
- cputime_t time, dtime;
- u64 delta;
+ u64 time, delta;
if (!likely(tsk->mm))
return;
time = stime + utime;
- dtime = time - tsk->acct_timexpd;
- /* Avoid division: cputime_t is often in nanoseconds already. */
- delta = cputime_to_nsecs(dtime);
+ delta = time - tsk->acct_timexpd;
if (delta < TICK_NSEC)
return;
@@ -155,7 +152,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
*/
void acct_update_integrals(struct task_struct *tsk)
{
- cputime_t utime, stime;
+ u64 utime, stime;
unsigned long flags;
local_irq_save(flags);
diff --git a/kernel/ucount.c b/kernel/ucount.c
new file mode 100644
index 000000000000..95c6336fc2b3
--- /dev/null
+++ b/kernel/ucount.c
@@ -0,0 +1,236 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+#define UCOUNTS_HASHTABLE_BITS 10 /* table has 1 << 10 buckets */
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+static DEFINE_SPINLOCK(ucounts_lock); /* held around hash lookup, insert and removal */
+/*
+ * ucounts_hashfn() hashes the sum of the raw uid value and the namespace
+ * pointer down to UCOUNTS_HASHTABLE_BITS bits; ucounts_hashentry() turns
+ * that index into a pointer to the corresponding bucket head.
+ */
+#define ucounts_hashfn(ns, uid) \
+ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+ UCOUNTS_HASHTABLE_BITS)
+#define ucounts_hashentry(ns, uid) \
+ (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+/* ->lookup callback of set_root: the caller's user namespace owns the set. */
+static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+{
+	struct user_namespace *user_ns = current_user_ns();
+
+	return &user_ns->set;
+}
+
+/* A set is visible only when it belongs to the caller's user namespace. */
+static int set_is_seen(struct ctl_table_set *set)
+{
+	struct ctl_table_set *own_set = &current_user_ns()->set;
+
+	return set == own_set;
+}
+
+/*
+ * ->permissions callback of set_root.  Computes one rwx triplet and
+ * replicates it into the user, group and other positions of the result.
+ */
+static int set_permissions(struct ctl_table_header *head,
+			   struct ctl_table *table)
+{
+	struct user_namespace *user_ns;
+	int bits;
+
+	user_ns = container_of(head->set, struct user_namespace, set);
+
+	/*
+	 * Holders of CAP_SYS_RESOURCE in the owning namespace get the
+	 * table's owner bits unrestrained; all others get at most
+	 * read-only access.
+	 */
+	bits = ns_capable(user_ns, CAP_SYS_RESOURCE) ?
+		(table->mode & S_IRWXU) >> 6 :
+		table->mode & S_IROTH;
+
+	return (bits << 6) | (bits << 3) | bits;
+}
+
+static struct ctl_table_root set_root = {
+ .lookup = set_lookup, /* returns the current user namespace's set */
+ .permissions = set_permissions, /* CAP_SYS_RESOURCE-based mode filtering */
+};
+
+static int zero = 0; /* lower bound, passed to proc_dointvec_minmax via .extra1 */
+static int int_max = INT_MAX; /* upper bound, passed via .extra2 */
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(int), \
+ .mode = 0644, \
+ .proc_handler = proc_dointvec_minmax, \
+ .extra1 = &zero, \
+ .extra2 = &int_max, \
+ }
+static struct ctl_table user_table[] = {
+ /*
+ * Template only: .data is left unset here.  setup_userns_sysctls()
+ * duplicates the table and points entry i at ns->ucount_max[i], so
+ * the order of entries must follow the ucount_max[] indices.
+ */
+ UCOUNT_ENTRY("max_user_namespaces"),
+ UCOUNT_ENTRY("max_pid_namespaces"),
+ UCOUNT_ENTRY("max_uts_namespaces"),
+ UCOUNT_ENTRY("max_ipc_namespaces"),
+ UCOUNT_ENTRY("max_net_namespaces"),
+ UCOUNT_ENTRY("max_mnt_namespaces"),
+ UCOUNT_ENTRY("max_cgroup_namespaces"),
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Register the per-namespace "user" sysctl table for @ns.
+ *
+ * Returns true on success (always, when CONFIG_SYSCTL is off) and false
+ * when the table could not be duplicated or registered; in the failure
+ * case the sysctl set is retired again before returning.
+ */
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *table;
+	int idx;
+
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	/* Private copy of the template, wired to this namespace's limits. */
+	table = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+	if (table) {
+		for (idx = 0; idx < UCOUNT_COUNTS; idx++)
+			table[idx].data = &ns->ucount_max[idx];
+		ns->sysctls = __register_sysctl_table(&ns->set, "user", table);
+	}
+
+	/* Covers both a failed kmemdup() and a failed registration. */
+	if (!ns->sysctls) {
+		kfree(table);
+		retire_sysctl_set(&ns->set);
+		return false;
+	}
+#endif
+	return true;
+}
+
+/* Undo setup_userns_sysctls(): unregister, retire the set, free the table. */
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+	/* Grab the table pointer before the header is unregistered. */
+	struct ctl_table *table = ns->sysctls->ctl_table_arg;
+
+	unregister_sysctl_table(ns->sysctls);
+	retire_sysctl_set(&ns->set);
+	kfree(table);
+#endif
+}
+
+/*
+ * Walk one hash bucket looking for the entry that matches both @uid and
+ * @ns.  Returns the entry, or NULL when the bucket has no match.
+ */
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid,
+				    struct hlist_head *hashent)
+{
+	struct ucounts *pos;
+
+	hlist_for_each_entry(pos, hashent, node) {
+		if (!uid_eq(pos->uid, uid))
+			continue;
+		if (pos->ns == ns)
+			return pos;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up, or create and hash, the ucounts entry for (@ns, @uid) and take
+ * a reference on it.  Returns NULL if the allocation fails or if the
+ * reference count is already at INT_MAX.
+ */
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+	struct hlist_head *bucket = ucounts_hashentry(ns, uid);
+	struct ucounts *ucounts, *fresh;
+
+	spin_lock_irq(&ucounts_lock);
+	ucounts = find_ucounts(ns, uid, bucket);
+	if (!ucounts) {
+		/* GFP_KERNEL allocation: drop the spinlock around it. */
+		spin_unlock_irq(&ucounts_lock);
+
+		fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
+		if (!fresh)
+			return NULL;
+
+		fresh->ns = ns;
+		fresh->uid = uid;
+		atomic_set(&fresh->count, 0);
+
+		/*
+		 * The lock was released, so look again: an entry may have
+		 * been hashed in the meantime, in which case ours is dropped.
+		 */
+		spin_lock_irq(&ucounts_lock);
+		ucounts = find_ucounts(ns, uid, bucket);
+		if (!ucounts) {
+			hlist_add_head(&fresh->node, bucket);
+			ucounts = fresh;
+		} else {
+			kfree(fresh);
+		}
+	}
+
+	/* Take the reference; refuse once the count has reached INT_MAX. */
+	if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+		ucounts = NULL;
+	spin_unlock_irq(&ucounts_lock);
+
+	return ucounts;
+}
+
+/*
+ * Drop a reference obtained from get_ucounts().  When the last reference
+ * goes away the entry is removed from the hash table and freed.
+ *
+ * The final 1 -> 0 transition has to happen with ucounts_lock held:
+ * get_ucounts() looks entries up and bumps ->count under that lock, so if
+ * the count were dropped to zero before taking the lock, a concurrent
+ * get_ucounts() could still find the hashed entry, take a new reference,
+ * and then have the entry freed underneath it.  atomic_dec_and_lock()
+ * decrements locklessly unless this is the last reference, in which case
+ * it returns true with the lock held.  Interrupts are disabled around it
+ * to match the irq-disabling use of ucounts_lock elsewhere in this file.
+ */
+static void put_ucounts(struct ucounts *ucounts)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (atomic_dec_and_lock(&ucounts->count, &ucounts_lock)) {
+		hlist_del_init(&ucounts->node);
+		spin_unlock(&ucounts_lock);
+		local_irq_restore(flags);
+
+		kfree(ucounts);
+		return;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Increment @v by one, but only while its value is below @u.
+ * Returns true if the increment was performed, false if the value was
+ * already at or above @u.
+ */
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+	int cur = atomic_read(v);
+
+	while (likely(cur < u)) {
+		int seen = atomic_cmpxchg(v, cur, cur + 1);
+
+		if (likely(seen == cur))
+			return true;
+		/* Lost a race: retry against the value actually observed. */
+		cur = seen;
+	}
+
+	return false;
+}
+
+/*
+ * Charge one object of kind @type to (@ns, @uid) and to every level
+ * reached by following ns->ucounts upwards.  Each level's counter must
+ * stay below that level's ucount_max[type].
+ *
+ * Returns the ucounts entry for (@ns, @uid) on success.  Returns NULL if
+ * the entry could not be obtained or if some level is at its limit; in the
+ * latter case all increments already made are rolled back and the
+ * reference is dropped.
+ */
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+			   enum ucount_type type)
+{
+	struct ucounts *ucounts = get_ucounts(ns, uid);
+	struct ucounts *pos, *undo;
+
+	for (pos = ucounts; pos; pos = pos->ns->ucounts) {
+		int limit = READ_ONCE(pos->ns->ucount_max[type]);
+
+		if (!atomic_inc_below(&pos->ucount[type], limit))
+			break;
+	}
+
+	/* Ran off the top of the chain: every level accepted the charge. */
+	if (!pos)
+		return ucounts;
+
+	/* @pos refused; undo the levels below it that were incremented. */
+	for (undo = ucounts; undo != pos; undo = undo->ns->ucounts)
+		atomic_dec(&undo->ucount[type]);
+
+	put_ucounts(ucounts);
+	return NULL;
+}
+
+/*
+ * Reverse a successful inc_ucount(): decrement the @type counter at every
+ * level of the chain and drop the reference on @ucounts.  A counter that
+ * would go negative is left alone and triggers a one-time warning.
+ */
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+	struct ucounts *pos = ucounts;
+
+	while (pos) {
+		int remaining = atomic_dec_if_positive(&pos->ucount[type]);
+
+		WARN_ON_ONCE(remaining < 0);
+		pos = pos->ns->ucounts;
+	}
+
+	put_ucounts(ucounts);
+}
+
+/* Initcall: create the "user" sysctl directory and init_user_ns's table. */
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	static struct ctl_table no_entries[1];
+	static struct ctl_table_header *dir_header;
+
+	/*
+	 * The "user" directory has to be registered in the default set
+	 * first, so that registrations in the child sets work properly.
+	 */
+	dir_header = register_sysctl("user", no_entries);
+	kmemleak_ignore(dir_header);
+	BUG_ON(!dir_header);
+	BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+	return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d58cc4d8f0d1..71645ae9303a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -14,7 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
@@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist,
kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
- kgid = GROUP_AT(group_info, i);
+ kgid = group_info->gid[i];
group = high2lowgid(from_kgid_munged(user_ns, kgid));
if (put_user(group, grouplist+i))
return -EFAULT;
@@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info,
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
diff --git a/kernel/up.c b/kernel/up.c
index 1760bf3d1463..ee81ac9af4ca 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/smp.h>
+#include <linux/hypervisor.h>
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
@@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond);
+
+/*
+ * Uniprocessor version: only CPU 0 exists, so any other @cpu is rejected
+ * with -ENXIO.  Otherwise @func is called directly with @par and its
+ * return value is passed back.  When @phys is set, the call is bracketed
+ * by hypervisor_pin_vcpu(0) and hypervisor_pin_vcpu(-1).
+ */
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+	int ret;
+
+	if (cpu != 0)
+		return -ENXIO;
+
+	if (!phys)
+		return func(par);
+
+	hypervisor_pin_vcpu(0);
+	ret = func(par);
+	hypervisor_pin_vcpu(-1);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 68f594212759..86b7854fec8e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+/* Charge one user namespace to @uid in @ns; NULL means the charge failed. */
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+	return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+/*
+ * Release a charge taken by inc_user_namespaces().  dec_ucount() returns
+ * void, so it is simply called rather than "returned" from this void
+ * function.
+ */
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
+}
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
- int ret;
+ struct ucounts *ucounts;
+ int ret, i;
+ ret = -ENOSPC;
if (parent_ns->level > 32)
- return -EUSERS;
+ goto fail;
+
+ ucounts = inc_user_namespaces(parent_ns, owner);
+ if (!ucounts)
+ goto fail;
/*
* Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
* by verifing that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
+ ret = -EPERM;
if (current_chrooted())
- return -EPERM;
+ goto fail_dec;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
*/
+ ret = -EPERM;
if (!kuid_has_mapping(parent_ns, owner) ||
!kgid_has_mapping(parent_ns, group))
- return -EPERM;
+ goto fail_dec;
+ ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
- return -ENOMEM;
+ goto fail_dec;
ret = ns_alloc_inum(&ns->ns);
- if (ret) {
- kmem_cache_free(user_ns_cachep, ns);
- return ret;
- }
+ if (ret)
+ goto fail_free;
ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
+ INIT_WORK(&ns->work, free_user_ns);
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ ns->ucount_max[i] = INT_MAX;
+ }
+ ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
mutex_lock(&userns_state_mutex);
ns->flags = parent_ns->flags;
mutex_unlock(&userns_state_mutex);
- set_cred_user_ns(new, ns);
-
#ifdef CONFIG_PERSISTENT_KEYRINGS
init_rwsem(&ns->persistent_keyring_register_sem);
#endif
+ ret = -ENOMEM;
+ if (!setup_userns_sysctls(ns))
+ goto fail_keyring;
+
+ set_cred_user_ns(new, ns);
return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ key_put(ns->persistent_keyring_register);
+#endif
+ ns_free_inum(&ns->ns);
+fail_free:
+ kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+ dec_user_namespaces(ucounts);
+fail:
+ return ret;
}
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
return err;
}
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
{
- struct user_namespace *parent;
+ struct user_namespace *parent, *ns =
+ container_of(work, struct user_namespace, work);
do {
+ struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
+ dec_user_namespaces(ucounts);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
}
-EXPORT_SYMBOL(free_user_ns);
+
+void __put_user_ns(struct user_namespace *ns)
+{
+ schedule_work(&ns->work); /* ns->work is bound to free_user_ns() by INIT_WORK() in create_user_ns() */
+}
+EXPORT_SYMBOL(__put_user_ns);
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
@@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return commit_creds(cred);
}
+/*
+ * Return the ns_common of the user namespace that owns @ns, with a
+ * reference taken on it.  The owner must be the caller's user namespace or
+ * one of its descendants; otherwise ERR_PTR(-EPERM) is returned.
+ */
+struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+	struct user_namespace *caller_ns = current_user_ns();
+	struct user_namespace *owner = ns->ops->owner(ns);
+	struct user_namespace *walk;
+
+	/* Climb from the owner towards the root, looking for the caller. */
+	for (walk = owner; walk; walk = walk->parent) {
+		if (walk == caller_ns)
+			return &get_user_ns(owner)->ns;
+	}
+
+	return ERR_PTR(-EPERM);
+}
+
+/* ->owner callback: a user namespace is owned by its parent. */
+static struct user_namespace *userns_owner(struct ns_common *ns)
+{
+	struct user_namespace *user_ns = to_user_ns(ns);
+
+	return user_ns->parent;
+}
+
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
+ .owner = userns_owner,
+ .get_parent = ns_get_owner,
};
static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 831ea7108232..6976cd47dcf6 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,16 @@
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+/* Charge one UTS namespace to the caller's effective uid within @ns. */
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+	kuid_t euid = current_euid();
+
+	return inc_ucount(ns, euid, UCOUNT_UTS_NAMESPACES);
+}
+
+/* Release a charge taken by inc_uts_namespaces(). */
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
static struct uts_namespace *create_uts_ns(void)
{
struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
+ struct ucounts *ucounts;
int err;
+ err = -ENOSPC;
+ ucounts = inc_uts_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
ns = create_uts_ns();
if (!ns)
- return ERR_PTR(-ENOMEM);
+ goto fail_dec;
err = ns_alloc_inum(&ns->ns);
- if (err) {
- kfree(ns);
- return ERR_PTR(err);
- }
+ if (err)
+ goto fail_free;
+ ns->ucounts = ucounts;
ns->ns.ops = &utsns_operations;
down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
return ns;
+
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_uts_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
}
/*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
return 0;
}
+/* ->owner callback: report the user namespace stored in the UTS namespace. */
+static struct user_namespace *utsns_owner(struct ns_common *ns)
+{
+	struct uts_namespace *uts_ns = to_uts_ns(ns);
+
+	return uts_ns->user_ns;
+}
+
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
+ .owner = utsns_owner,
};
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/perf_event.h>
#include <linux/kthread.h>
-/*
- * The run state of the lockup detectors is controlled by the content of the
- * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
- * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
- *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
- */
-#define NMI_WATCHDOG_ENABLED_BIT 0
-#define SOFT_WATCHDOG_ENABLED_BIT 1
-#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
-#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
-
static DEFINE_MUTEX(watchdog_proc_mutex);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -70,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -100,50 +81,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
static unsigned long soft_lockup_nmi_warn;
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -264,32 +204,14 @@ void touch_all_softlockup_watchdogs(void)
wq_watchdog_touch(-1);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-#endif
-
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static bool is_hardlockup(void)
+bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -299,7 +221,6 @@ static bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint);
return false;
}
-#endif
static int is_softlockup(unsigned long touch_ts)
{
@@ -313,78 +234,22 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-
-static struct perf_event_attr wd_hw_attr = {
- .type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
- .size = sizeof(struct perf_event_attr),
- .pinned = 1,
- .disabled = 1,
-};
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- /* Ensure the watchdog never gets throttled */
- event->hw.interrupts = 0;
-
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
- struct pt_regs *regs = get_irq_regs();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
-}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_nmi_enable(unsigned int cpu);
-static void watchdog_nmi_disable(unsigned int cpu);
+/*
+ * These two functions are mostly architecture specific;
+ * they are defined as weak here so that an architecture can override them.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);
@@ -397,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return HRTIMER_NORESTART;
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -577,109 +445,6 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long cpu0_err;
-
-static int watchdog_nmi_enable(unsigned int cpu)
-{
- struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
- /* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
-
- if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
-
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
-}
-
-static void watchdog_nmi_disable(unsigned int cpu)
-{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- if (event) {
- perf_event_disable(event);
- per_cpu(watchdog_ev, cpu) = NULL;
-
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
- }
- if (cpu == 0) {
- /* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
- }
-}
-
-#else
-static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
-static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
@@ -707,12 +472,16 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;
+ atomic_set(&watchdog_park_in_progress, 1);
+
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
+ atomic_set(&watchdog_park_in_progress, 0);
+
return ret;
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..12b8dd640786
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,230 @@
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+
+/* boot commands */
+/*
+ * Should we panic when a hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using raw_cpu_write() here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /* Ensure the watchdog never gets throttled */
+ event->hw.interrupts = 0;
+
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return;
+
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__this_cpu_read(hard_watchdog_warn) == true)
+ return;
+
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ __this_cpu_write(hard_watchdog_warn, true);
+ return;
+ }
+
+ __this_cpu_write(hard_watchdog_warn, false);
+ return;
+}
+
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparison */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
+ if (!IS_ERR(event)) {
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+ goto out_save;
+ }
+
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
+ else
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
+ return PTR_ERR(event);
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ef071ca73fc3..072cbc9b175d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,6 +290,8 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444);
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+static bool wq_online; /* can kworkers be created yet? */
+
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -1521,8 +1523,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
- timer_stats_timer_set_start_info(&dwork->timer);
-
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2583,6 +2583,9 @@ void flush_workqueue(struct workqueue_struct *wq)
};
int next_color;
+ if (WARN_ON(!wq_online))
+ return;
+
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
@@ -2843,6 +2846,9 @@ bool flush_work(struct work_struct *work)
{
struct wq_barrier barr;
+ if (WARN_ON(!wq_online))
+ return false;
+
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
@@ -2913,7 +2919,13 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
mark_work_canceling(work);
local_irq_restore(flags);
- flush_work(work);
+ /*
+ * This allows canceling during early boot. We know that @work
+ * isn't executing.
+ */
+ if (wq_online)
+ flush_work(work);
+
clear_work_data(work);
/*
@@ -2974,6 +2986,31 @@ bool flush_delayed_work(struct delayed_work *dwork)
}
EXPORT_SYMBOL(flush_delayed_work);
+static bool __cancel_work(struct work_struct *work, bool is_dwork)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(work, get_work_pool_id(work));
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * See cancel_delayed_work()
+ */
+bool cancel_work(struct work_struct *work)
+{
+ return __cancel_work(work, false);
+}
+
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
@@ -2992,20 +3029,7 @@ EXPORT_SYMBOL(flush_delayed_work);
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
- unsigned long flags;
- int ret;
-
- do {
- ret = try_to_grab_pending(&dwork->work, true, &flags);
- } while (unlikely(ret == -EAGAIN));
-
- if (unlikely(ret < 0))
- return false;
-
- set_work_pool_and_clear_pending(&dwork->work,
- get_work_pool_id(&dwork->work));
- local_irq_restore(flags);
- return ret;
+ return __cancel_work(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);
@@ -3352,7 +3376,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
goto fail;
/* create and start the initial worker */
- if (!create_worker(pool))
+ if (wq_online && !create_worker(pool))
goto fail;
/* install */
@@ -3417,6 +3441,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
{
struct workqueue_struct *wq = pwq->wq;
bool freezable = wq->flags & WQ_FREEZABLE;
+ unsigned long flags;
/* for @wq->saved_max_active */
lockdep_assert_held(&wq->mutex);
@@ -3425,7 +3450,8 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
if (!freezable && pwq->max_active == wq->saved_max_active)
return;
- spin_lock_irq(&pwq->pool->lock);
+ /* this function can be called during early boot w/ irq disabled */
+ spin_lock_irqsave(&pwq->pool->lock, flags);
/*
* During [un]freezing, the caller is responsible for ensuring that
@@ -3448,7 +3474,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
pwq->max_active = 0;
}
- spin_unlock_irq(&pwq->pool->lock);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
/* initialize newly alloced @pwq which is associated with @wq and @pool */
@@ -4021,6 +4047,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
for (i = 0; i < WORK_NR_COLORS; i++) {
if (WARN_ON(pwq->nr_in_flight[i])) {
mutex_unlock(&wq->mutex);
+ show_workqueue_state();
return;
}
}
@@ -4029,6 +4056,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
WARN_ON(pwq->nr_active) ||
WARN_ON(!list_empty(&pwq->delayed_works))) {
mutex_unlock(&wq->mutex);
+ show_workqueue_state();
return;
}
}
@@ -4249,7 +4277,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
* This function is called without any synchronization and @task
* could be in any state. Be careful with dereferences.
*/
- worker = probe_kthread_data(task);
+ worker = kthread_probe_data(task);
/*
* Carefully copy the associated workqueue's workfn and name. Keep
@@ -5455,7 +5483,17 @@ static void __init wq_numa_init(void)
wq_numa_enabled = true;
}
-static int __init init_workqueues(void)
+/**
+ * workqueue_init_early - early init for workqueue subsystem
+ *
+ * This is the first half of two-staged workqueue subsystem initialization
+ * and invoked as soon as the bare basics - memory allocation, cpumasks and
+ * idr are up. It sets up all the data structures and system workqueues
+ * and allows early boot code to create workqueues and queue/cancel work
+ * items. Actual work item execution starts only after kthreads can be
+ * created and scheduled right before early initcalls.
+ */
+int __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
@@ -5467,8 +5505,6 @@ static int __init init_workqueues(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- wq_numa_init();
-
/* initialize CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
@@ -5488,16 +5524,6 @@ static int __init init_workqueues(void)
}
}
- /* create the initial worker */
- for_each_online_cpu(cpu) {
- struct worker_pool *pool;
-
- for_each_cpu_worker_pool(pool, cpu) {
- pool->flags &= ~POOL_DISASSOCIATED;
- BUG_ON(!create_worker(pool));
- }
- }
-
/* create default unbound and ordered wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
@@ -5534,8 +5560,59 @@ static int __init init_workqueues(void)
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+ return 0;
+}
+
+/**
+ * workqueue_init - bring workqueue subsystem fully online
+ *
+ * This is the latter half of two-staged workqueue subsystem initialization
+ * and invoked as soon as kthreads can be created and scheduled.
+ * Workqueues have been created and work items queued on them, but there
+ * are no kworkers executing the work items yet. Populate the worker pools
+ * with the initial workers and enable future kworker creations.
+ */
+int __init workqueue_init(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ int cpu, bkt;
+
+ /*
+ * It'd be simpler to initialize NUMA in workqueue_init_early() but
+ * CPU to node mapping may not be available that early on some
+ * archs such as power and arm64. As per-cpu pools created
+ * previously could be missing node hint and unbound pools NUMA
+ * affinity, fix them up.
+ */
+ wq_numa_init();
+
+ mutex_lock(&wq_pool_mutex);
+
+ for_each_possible_cpu(cpu) {
+ for_each_cpu_worker_pool(pool, cpu) {
+ pool->node = cpu_to_node(cpu);
+ }
+ }
+
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, smp_processor_id(), true);
+
+ mutex_unlock(&wq_pool_mutex);
+
+ /* create the initial workers */
+ for_each_online_cpu(cpu) {
+ for_each_cpu_worker_pool(pool, cpu) {
+ pool->flags &= ~POOL_DISASSOCIATED;
+ BUG_ON(!create_worker(pool));
+ }
+ }
+
+ hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
+ BUG_ON(!create_worker(pool));
+
+ wq_online = true;
wq_watchdog_init();
return 0;
}
-early_initcall(init_workqueues);