author     Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
commit     1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch)
tree       a5dabaa924d50867cbe347e20a7643b2850f11c0 /include
parent     a2f0e7eee1344eb9f91b22bc72d9eb0a52b849c9 (diff)
parent     7c4a5b89a0b5a57a64b601775b296abf77a9fe97 (diff)
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Improve the scalability of the CFS bandwidth unthrottling logic with
   large number of CPUs.

 - Fix & rework various cpuidle routines, simplify interaction with the
   generic scheduler code. Add __cpuidle methods as noinstr to objtool's
   noinstr detection and fix boatloads of cpuidle bugs & quirks.

 - Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
   previously issued registrations.

 - Limit scheduler slice duration to the sysctl_sched_latency period, to
   improve scheduling granularity with a large number of SCHED_IDLE tasks.

 - Debuggability enhancement on sys_exit(): warn about disabled IRQs, but
   also enable them to prevent a cascade of followup problems and repeat
   warnings.

 - Fix the rescheduling logic in prio_changed_dl().

 - Micro-optimize cpufreq and sched-util methods.

 - Micro-optimize ttwu_runnable()

 - Micro-optimize the idle-scanning in update_numa_stats(),
   select_idle_capacity() and steal_cookie_task().

 - Update the RSEQ code & self-tests

 - Constify various scheduler methods

 - Remove unused methods

 - Refine __init tags

 - Documentation updates

 - Misc other cleanups, fixes

* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
  sched/rt: pick_next_rt_entity(): check list_entry
  sched/deadline: Add more reschedule cases to prio_changed_dl()
  sched/fair: sanitize vruntime of entity being placed
  sched/fair: Remove capacity inversion detection
  sched/fair: unlink misfit task from cpu overutilized
  objtool: mem*() are not uaccess safe
  cpuidle: Fix poll_idle() noinstr annotation
  sched/clock: Make local_clock() noinstr
  sched/clock/x86: Mark sched_clock() noinstr
  x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
  x86/atomics: Always inline arch_atomic64*()
  cpuidle: tracing, preempt: Squash _rcuidle tracing
  cpuidle: tracing: Warn about !rcu_is_watching()
  cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
  cpuidle: drivers: firmware: psci: Dont instrument suspend code
  KVM: selftests: Fix build of rseq test
  exit: Detect and fix irq disabled state in oops
  cpuidle, arm64: Fix the ARM64 cpuidle logic
  cpuidle: mvebu: Fix duplicate flags assignment
  sched/fair: Limit sched slice duration
  ...
Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/vmlinux.lds.h  |  9
-rw-r--r--  include/linux/auxvec.h             |  2
-rw-r--r--  include/linux/clockchips.h         |  4
-rw-r--r--  include/linux/compiler_types.h     | 18
-rw-r--r--  include/linux/context_tracking.h   | 27
-rw-r--r--  include/linux/cpu.h                |  3
-rw-r--r--  include/linux/cpuidle.h            | 50
-rw-r--r--  include/linux/cpumask.h            |  4
-rw-r--r--  include/linux/math64.h             |  4
-rw-r--r--  include/linux/mm.h                 | 25
-rw-r--r--  include/linux/mm_types.h           | 43
-rw-r--r--  include/linux/percpu-defs.h        |  2
-rw-r--r--  include/linux/sched.h              |  9
-rw-r--r--  include/linux/sched/clock.h        |  8
-rw-r--r--  include/linux/sched/cputime.h      |  9
-rw-r--r--  include/linux/sched/idle.h         | 40
-rw-r--r--  include/linux/thread_info.h        | 18
-rw-r--r--  include/linux/trace_recursion.h    | 18
-rw-r--r--  include/linux/tracepoint.h         | 15
-rw-r--r--  include/trace/events/rseq.h        |  7
-rw-r--r--  include/uapi/linux/auxvec.h        |  2
-rw-r--r--  include/uapi/linux/membarrier.h    |  4
-rw-r--r--  include/uapi/linux/rseq.h          | 22
23 files changed, 290 insertions, 53 deletions
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 659bf3b31c91..d1f57e4868ed 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -558,6 +558,9 @@
ALIGN_FUNCTION(); \
__noinstr_text_start = .; \
*(.noinstr.text) \
+ __cpuidle_text_start = .; \
+ *(.cpuidle.text) \
+ __cpuidle_text_end = .; \
__noinstr_text_end = .;
/*
@@ -598,12 +601,6 @@
*(.spinlock.text) \
__lock_text_end = .;
-#define CPUIDLE_TEXT \
- ALIGN_FUNCTION(); \
- __cpuidle_text_start = .; \
- *(.cpuidle.text) \
- __cpuidle_text_end = .;
-
#define KPROBES_TEXT \
ALIGN_FUNCTION(); \
__kprobes_text_start = .; \
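
The change above keeps the cpuidle text bracketed by its own start/end symbols while nesting it inside the noinstr region, so cpu_in_idle()-style lookups keep working and objtool's noinstr validation now covers it too. A minimal sketch of such a range check, assuming only the linker-provided bounds; the helper name is illustrative, not a kernel API:

	extern char __cpuidle_text_start[], __cpuidle_text_end[];

	/* Hypothetical helper: is this program counter inside .cpuidle.text? */
	static bool pc_in_cpuidle_text(unsigned long pc)
	{
		return pc >= (unsigned long)__cpuidle_text_start &&
		       pc <  (unsigned long)__cpuidle_text_end;
	}
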
diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h
index f68d0ec2d740..407f7005e6d6 100644
--- a/include/linux/auxvec.h
+++ b/include/linux/auxvec.h
@@ -4,6 +4,6 @@
#include <uapi/linux/auxvec.h>
-#define AT_VECTOR_SIZE_BASE 20 /* NEW_AUX_ENT entries in auxiliary table */
+#define AT_VECTOR_SIZE_BASE 22 /* NEW_AUX_ENT entries in auxiliary table */
/* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */
#endif /* _LINUX_AUXVEC_H */
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 8ae9a95ebf5b..9aac31d856f3 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -211,7 +211,7 @@ extern int tick_receive_broadcast(void);
extern void tick_setup_hrtimer_broadcast(void);
extern int tick_check_broadcast_expired(void);
# else
-static inline int tick_check_broadcast_expired(void) { return 0; }
+static __always_inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }
# endif
@@ -219,7 +219,7 @@ static inline void tick_setup_hrtimer_broadcast(void) { }
static inline void clockevents_suspend(void) { }
static inline void clockevents_resume(void) { }
-static inline int tick_check_broadcast_expired(void) { return 0; }
+static __always_inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 7c1afe0f4129..dea5bf5bd09c 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -232,11 +232,25 @@ struct ftrace_likely_data {
#endif
/* Section for code which can't be instrumented at all */
-#define noinstr \
- noinline notrace __attribute((__section__(".noinstr.text"))) \
+#define __noinstr_section(section) \
+ noinline notrace __attribute((__section__(section))) \
__no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \
__no_sanitize_memory
+#define noinstr __noinstr_section(".noinstr.text")
+
+/*
+ * The __cpuidle section is used twofold:
+ *
+ * 1) the original use -- identifying if a CPU is 'stuck' in idle state based
+ * on it's instruction pointer. See cpu_in_idle().
+ *
+ * 2) supressing instrumentation around where cpuidle disables RCU; where the
+ * function isn't strictly required for #1, this is interchangeable with
+ * noinstr.
+ */
+#define __cpuidle __noinstr_section(".cpuidle.text")
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
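
With __cpuidle now defined via __noinstr_section(), marking a low-level idle helper is a single annotation that places it in .cpuidle.text inside the noinstr region. A hedged sketch; the function and its body are hypothetical:

	/*
	 * Hypothetical low-level idle helper: lands in .cpuidle.text, so it is
	 * both detectable by cpu_in_idle() and held to objtool's noinstr rules.
	 */
	static __cpuidle void example_wait_for_interrupt(void)
	{
		cpu_relax();	/* stand-in for the arch-specific idle instruction */
	}
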
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index dcef4a9e4d63..d4afa8508a80 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -130,9 +130,36 @@ static __always_inline unsigned long ct_state_inc(int incby)
return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
}
+static __always_inline bool warn_rcu_enter(void)
+{
+ bool ret = false;
+
+ /*
+ * Horrible hack to shut up recursive RCU isn't watching fail since
+ * lots of the actual reporting also relies on RCU.
+ */
+ preempt_disable_notrace();
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+ ret = true;
+ ct_state_inc(RCU_DYNTICKS_IDX);
+ }
+
+ return ret;
+}
+
+static __always_inline void warn_rcu_exit(bool rcu)
+{
+ if (rcu)
+ ct_state_inc(RCU_DYNTICKS_IDX);
+ preempt_enable_notrace();
+}
+
#else
static inline void ct_idle_enter(void) { }
static inline void ct_idle_exit(void) { }
+
+static __always_inline bool warn_rcu_enter(void) { return false; }
+static __always_inline void warn_rcu_exit(bool rcu) { }
#endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
#endif
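
The intended call pattern brackets reporting code that may run while RCU is not watching; warn_rcu_enter() temporarily makes RCU watch again and warn_rcu_exit() undoes exactly that. A sketch with a hypothetical wrapper function:

	static void example_report(const char *msg)
	{
		bool rcu = warn_rcu_enter();	/* force RCU to watch, if it wasn't */

		/* ... WARN()/printk()-style reporting that relies on RCU ... */

		warn_rcu_exit(rcu);		/* undo only what warn_rcu_enter() did */
	}
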
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 314802f98b9d..f83e4519c5f0 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -176,9 +176,6 @@ void __noreturn cpu_startup_entry(enum cpuhp_state state);
void cpu_idle_poll_ctrl(bool enable);
-/* Attach to any functions which should be considered cpuidle. */
-#define __cpuidle __section(".cpuidle.text")
-
bool cpu_in_idle(unsigned long pc);
void arch_cpu_idle(void);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index fce476275e16..3183aeb7f5b4 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
+#include <linux/context_tracking.h>
#define CPUIDLE_STATE_MAX 10
#define CPUIDLE_NAME_LEN 16
@@ -115,6 +116,35 @@ struct cpuidle_device {
DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev);
+static __always_inline void ct_cpuidle_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ /*
+ * Idle is allowed to (temporary) enable IRQs. It
+ * will return with IRQs disabled.
+ *
+ * Trace IRQs enable here, then switch off RCU, and have
+ * arch_cpu_idle() use raw_local_irq_enable(). Note that
+ * ct_idle_enter() relies on lockdep IRQ state, so switch that
+ * last -- this is very similar to the entry code.
+ */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ instrumentation_end();
+ ct_idle_enter();
+ lockdep_hardirqs_on(_RET_IP_);
+}
+
+static __always_inline void ct_cpuidle_exit(void)
+{
+ /*
+ * Carefully undo the above.
+ */
+ lockdep_hardirqs_off(_RET_IP_);
+ ct_idle_exit();
+ instrumentation_begin();
+}
+
/****************************
* CPUIDLE DRIVER INTERFACE *
****************************/
@@ -277,7 +307,7 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu);
#define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \
idx, \
state, \
- is_retention) \
+ is_retention, is_rcu) \
({ \
int __ret = 0; \
\
@@ -289,7 +319,11 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu);
if (!is_retention) \
__ret = cpu_pm_enter(); \
if (!__ret) { \
+ if (!is_rcu) \
+ ct_cpuidle_enter(); \
__ret = low_level_idle_enter(state); \
+ if (!is_rcu) \
+ ct_cpuidle_exit(); \
if (!is_retention) \
cpu_pm_exit(); \
} \
@@ -298,15 +332,21 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu);
})
#define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \
- __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0)
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0, 0)
#define CPU_PM_CPU_IDLE_ENTER_RETENTION(low_level_idle_enter, idx) \
- __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1)
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1, 0)
#define CPU_PM_CPU_IDLE_ENTER_PARAM(low_level_idle_enter, idx, state) \
- __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0)
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 0)
+
+#define CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(low_level_idle_enter, idx, state) \
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 1)
#define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(low_level_idle_enter, idx, state) \
- __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1)
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 0)
+
+#define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(low_level_idle_enter, idx, state) \
+ __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 1)
#endif /* _LINUX_CPUIDLE_H */
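
The new is_rcu argument lets a platform whose low-level suspend path does its own RCU/context-tracking bookkeeping opt out of the wrapper's ct_cpuidle_enter()/exit() calls. A hedged sketch of a hypothetical ->enter() callback using the _RCU variant (driver names and the state parameter are illustrative):

	static __cpuidle int example_plat_suspend(unsigned long state)
	{
		/* firmware call that handles RCU itself would go here */
		return 0;
	}

	static int example_plat_enter(struct cpuidle_device *dev,
				      struct cpuidle_driver *drv, int idx)
	{
		/* _RCU variant passes is_rcu=1: skip ct_cpuidle_enter()/exit() */
		return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(example_plat_suspend, idx, idx);
	}
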
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c2aa0aa26b45..d45e5de13721 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1017,9 +1017,9 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
* concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
* region.
*/
-static inline unsigned int num_online_cpus(void)
+static __always_inline unsigned int num_online_cpus(void)
{
- return atomic_read(&__num_online_cpus);
+ return arch_atomic_read(&__num_online_cpus);
}
#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 8958f4c005c1..8b9191a2849e 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#ifndef mul_u64_u32_shr
-static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
return (u64)(((unsigned __int128)a * mul) >> shift);
}
@@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
#else
#ifndef mul_u64_u32_shr
-static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
u32 ah, al;
u64 ret;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd3197748562..716d30d93616 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1982,6 +1982,31 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, althrough it won't provide the memory usage benefits.
+ */
+ return raw_smp_processor_id();
+}
+#endif
+
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
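
task_mm_cid() is what kernel code would use to publish the concurrency ID to user space; with CONFIG_SCHED_MM_CID disabled it degrades to the CPU number, so the value stays usable for per-CPU-style indexing. A small hypothetical sketch:

	/* Hypothetical helper: copy the task's concurrency ID to a user pointer. */
	static int example_publish_mm_cid(struct task_struct *t, u32 __user *uptr)
	{
		u32 cid = task_mm_cid(t);	/* falls back to raw_smp_processor_id() */

		return put_user(cid, uptr);
	}
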
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9757067c3053..af8119776ab1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -645,7 +645,18 @@ struct mm_struct {
* &struct mm_struct is freed.
*/
atomic_t mm_count;
-
+#ifdef CONFIG_SCHED_MM_CID
+ /**
+ * @cid_lock: Protect cid bitmap updates vs lookups.
+ *
+ * Prevent situations where updates to the cid bitmap happen
+ * concurrently with lookups. Those can lead to situations
+ * where a lookup cannot find a free bit simply because it was
+ * unlucky enough to load, non-atomically, bitmap words as they
+ * were being concurrently updated by the updaters.
+ */
+ raw_spinlock_t cid_lock;
+#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
#endif
@@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
vmi->mas.node = MAS_START;
}
+#ifdef CONFIG_SCHED_MM_CID
+/* Accessor for struct mm_struct's cidmask. */
+static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+{
+ unsigned long cid_bitmap = (unsigned long)mm;
+
+ cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+ /* Skip cpu_bitmap */
+ cid_bitmap += cpumask_size();
+ return (struct cpumask *)cid_bitmap;
+}
+
+static inline void mm_init_cid(struct mm_struct *mm)
+{
+ raw_spin_lock_init(&mm->cid_lock);
+ cpumask_clear(mm_cidmask(mm));
+}
+
+static inline unsigned int mm_cid_size(void)
+{
+ return cpumask_size();
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline unsigned int mm_cid_size(void)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_MM_CID */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
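
mm_cidmask() assumes the cid bitmap sits immediately after cpu_bitmap at the tail of struct mm_struct, with mm_cid_size() reserving the extra space at allocation time. A sketch of how the allocation size could be computed under that assumption; the helper is illustrative, not the actual allocator code:

	/* Hypothetical sizing helper mirroring the mm_cidmask() layout above. */
	static unsigned int example_mm_alloc_size(void)
	{
		return sizeof(struct mm_struct) +	/* ends in cpu_bitmap[] */
		       cpumask_size() +			/* space for cpu_bitmap */
		       mm_cid_size();			/* trailing cid bitmap, 0 if disabled */
	}
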
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index af1071535de8..e60727be79c4 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -310,7 +310,7 @@ extern void __bad_size_call_parameter(void);
#ifdef CONFIG_DEBUG_PREEMPT
extern void __this_cpu_preempt_check(const char *op);
#else
-static inline void __this_cpu_preempt_check(const char *op) { }
+static __always_inline void __this_cpu_preempt_check(const char *op) { }
#endif
#define __pcpu_size_call_return(stem, variable) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f6ce9ca7097..63d242164b1a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1302,6 +1302,7 @@ struct task_struct {
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
+ u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
@@ -1310,6 +1311,11 @@ struct task_struct {
unsigned long rseq_event_mask;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ int mm_cid; /* Current cid in mm */
+ int mm_cid_active; /* Whether cid bitmap is active */
+#endif
+
struct tlbflush_unmap_batch tlb_ubc;
union {
@@ -2352,10 +2358,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
} else {
t->rseq = current->rseq;
+ t->rseq_len = current->rseq_len;
t->rseq_sig = current->rseq_sig;
t->rseq_event_mask = current->rseq_event_mask;
}
@@ -2364,6 +2372,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
static inline void rseq_execve(struct task_struct *t)
{
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
}
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 867d588314e0..ca008f7d3615 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu)
return sched_clock();
}
-static inline u64 local_clock(void)
+static __always_inline u64 local_clock(void)
{
return sched_clock();
}
@@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu)
return sched_clock_cpu(cpu);
}
-static inline u64 local_clock(void)
-{
- return sched_clock_cpu(raw_smp_processor_id());
-}
+extern u64 local_clock(void);
+
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index ce3c58286062..5f8fd5b24a2e 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -8,15 +8,6 @@
* cputime accounting APIs:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm/cputime.h>
-
-#ifndef cputime_to_nsecs
-# define cputime_to_nsecs(__ct) \
- (cputime_to_usecs(__ct) * NSEC_PER_USEC)
-#endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern bool task_cputime(struct task_struct *t,
u64 *utime, u64 *stime);
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index d73d314d59c6..478084f9105e 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -23,12 +23,37 @@ static inline void wake_up_if_idle(int cpu) { }
*/
#ifdef TIF_POLLING_NRFLAG
-static inline void __current_set_polling(void)
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
+
+static __always_inline void __current_set_polling(void)
{
- set_thread_flag(TIF_POLLING_NRFLAG);
+ arch_set_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
}
-static inline bool __must_check current_set_polling_and_test(void)
+static __always_inline void __current_clr_polling(void)
+{
+ arch_clear_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#else
+
+static __always_inline void __current_set_polling(void)
+{
+ set_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+static __always_inline void __current_clr_polling(void)
+{
+ clear_bit(TIF_POLLING_NRFLAG,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */
+
+static __always_inline bool __must_check current_set_polling_and_test(void)
{
__current_set_polling();
@@ -41,12 +66,7 @@ static inline bool __must_check current_set_polling_and_test(void)
return unlikely(tif_need_resched());
}
-static inline void __current_clr_polling(void)
-{
- clear_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
+static __always_inline bool __must_check current_clr_polling_and_test(void)
{
__current_clr_polling();
@@ -73,7 +93,7 @@ static inline bool __must_check current_clr_polling_and_test(void)
}
#endif
-static inline void current_clr_polling(void)
+static __always_inline void current_clr_polling(void)
{
__current_clr_polling();
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9f392ec76f2b..c02646884fa8 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -177,7 +177,23 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti
clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */
-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+
+static __always_inline bool tif_need_resched(void)
+{
+ return arch_test_bit(TIF_NEED_RESCHED,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#else
+
+static __always_inline bool tif_need_resched(void)
+{
+ return test_bit(TIF_NEED_RESCHED,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
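
Both the sched/idle.h and thread_info.h hunks apply the same pattern: inside code that must stay uninstrumented, use the arch_*() bitops (which carry no KASAN/KCSAN hooks) whenever the instrumented wrapper headers are in play. A generic sketch of the pattern, with an illustrative helper name:

	static __always_inline bool example_test_flag_noinstr(unsigned long *flags, int bit)
	{
	#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
		return arch_test_bit(bit, flags);	/* raw op, no sanitizer hooks */
	#else
		return test_bit(bit, flags);		/* already uninstrumented */
	#endif
	}
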
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index c303f7a114e9..d48cd92d2364 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
# define do_ftrace_record_recursion(ip, pip) do { } while (0)
#endif
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+# define trace_warn_on_no_rcu(ip) \
+ ({ \
+ bool __ret = !rcu_is_watching(); \
+ if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
+ trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
+ WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
+ trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
+ } \
+ __ret; \
+ })
+#else
+# define trace_warn_on_no_rcu(ip) false
+#endif
+
/*
* Preemption is promised to be disabled when return bit >= 0.
*/
@@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
unsigned int val = READ_ONCE(current->trace_recursion);
int bit;
+ if (trace_warn_on_no_rcu(ip))
+ return -1;
+
bit = trace_get_context_bit() + start;
if (unlikely(val & (1 << bit))) {
/*
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 4b33b95eb8be..552f80b8362f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -178,6 +178,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#endif /* CONFIG_HAVE_STATIC_CALL */
/*
+ * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle
+ * code that disallow any/all tracing/instrumentation when RCU isn't watching.
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define RCUIDLE_COND(rcuidle) (rcuidle)
+#else
+/* srcu can't be used from NMI */
+#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi())
+#endif
+
+/*
* it_func[0] is never NULL because there is at least one element in the array
* when the array itself is non NULL.
*/
@@ -188,8 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
if (!(cond)) \
return; \
\
- /* srcu can't be used from NMI */ \
- WARN_ON_ONCE(rcuidle && in_nmi()); \
+ if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \
+ return; \
\
/* keep srcu and sched-rcu usage consistent */ \
preempt_disable_notrace(); \
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index a04a64bc1a00..823b47d1ba1e 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -16,13 +16,18 @@ TRACE_EVENT(rseq_update,
TP_STRUCT__entry(
__field(s32, cpu_id)
+ __field(s32, node_id)
+ __field(s32, mm_cid)
),
TP_fast_assign(
__entry->cpu_id = raw_smp_processor_id();
+ __entry->node_id = cpu_to_node(__entry->cpu_id);
+ __entry->mm_cid = task_mm_cid(t);
),
- TP_printk("cpu_id=%d", __entry->cpu_id)
+ TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
+ __entry->node_id, __entry->mm_cid)
);
TRACE_EVENT(rseq_ip_fixup,
diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index c7e502bf5a6f..6991c4b8ab18 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -30,6 +30,8 @@
* differ from AT_PLATFORM. */
#define AT_RANDOM 25 /* address of 16 random bytes */
#define AT_HWCAP2 26 /* extension of AT_HWCAP */
+#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
+#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
#define AT_EXECFN 31 /* filename of program */
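
The two new auxv entries let user space discover how much of the extensible struct rseq the kernel maintains and what alignment a registration must have, without probing. A minimal user-space sketch; the fallback defines only cover older installed headers:

	#include <stdio.h>
	#include <sys/auxv.h>

	#ifndef AT_RSEQ_FEATURE_SIZE
	# define AT_RSEQ_FEATURE_SIZE	27
	# define AT_RSEQ_ALIGN		28
	#endif

	int main(void)
	{
		unsigned long feat  = getauxval(AT_RSEQ_FEATURE_SIZE);
		unsigned long align = getauxval(AT_RSEQ_ALIGN);

		/* Both read as 0 on kernels/libcs that do not provide the entries. */
		printf("rseq feature size: %lu, required alignment: %lu\n", feat, align);
		return 0;
	}
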
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 737605897f36..5f3ad6d5be6f 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -137,6 +137,9 @@
* @MEMBARRIER_CMD_SHARED:
* Alias to MEMBARRIER_CMD_GLOBAL. Provided for
* header backward compatibility.
+ * @MEMBARRIER_CMD_GET_REGISTRATIONS:
+ * Returns a bitmask of previously issued
+ * registration commands.
*
* Command to be passed to the membarrier system call. The commands need to
* be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -153,6 +156,7 @@ enum membarrier_cmd {
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),
+ MEMBARRIER_CMD_GET_REGISTRATIONS = (1 << 9),
/* Alias for header backward compatibility. */
MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
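
A user-space sketch of querying the new command; per the comment above, the return value is a bitmask of the registration commands this process has already issued. The fallback define only covers older installed headers:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/membarrier.h>

	#ifndef MEMBARRIER_CMD_GET_REGISTRATIONS
	# define MEMBARRIER_CMD_GET_REGISTRATIONS	(1 << 9)
	#endif

	static long sys_membarrier(int cmd, unsigned int flags, int cpu_id)
	{
		return syscall(__NR_membarrier, cmd, flags, cpu_id);
	}

	int main(void)
	{
		long regs = sys_membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);

		if (regs < 0) {
			perror("membarrier");	/* e.g. EINVAL on kernels without the command */
			return 1;
		}
		if (regs & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
			puts("private expedited membarrier is registered");
		return 0;
	}
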
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 77ee207623a9..c233aae5eac9 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -130,6 +130,28 @@ struct rseq {
* this thread.
*/
__u32 flags;
+
+ /*
+ * Restartable sequences node_id field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current NUMA node ID.
+ */
+ __u32 node_id;
+
+ /*
+ * Restartable sequences mm_cid field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current thread's concurrency ID
+ * (allocated uniquely within a memory map).
+ */
+ __u32 mm_cid;
+
+ /*
+ * Flexible array member at end of structure, after last feature field.
+ */
+ char end[];
} __attribute__((aligned(4 * sizeof(__u64))));
#endif /* _UAPI_LINUX_RSEQ_H */
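
Reading the new fields from user space takes only plain loads on the thread's own registered rseq area, since the ABI guarantees single-copy atomicity and restricts readers to the owning thread. A hedged sketch, assuming headers that already carry the new fields and that rs points at this thread's registered struct rseq (for example located via glibc's __rseq_offset/__rseq_size exports) with a reported feature size covering node_id and mm_cid:

	#include <stdint.h>
	#include <linux/rseq.h>

	/* Plain reads suffice: each field is written by the kernel and read
	 * only by the owning thread with single-copy atomicity. */
	static inline uint32_t rseq_current_mm_cid(const volatile struct rseq *rs)
	{
		return rs->mm_cid;
	}

	static inline uint32_t rseq_current_node_id(const volatile struct rseq *rs)
	{
		return rs->node_id;
	}
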