summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 14:03:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-27 14:03:21 -0700
commited3b7923a816ded62dccef377c9ee346c7d3b1b4 (patch)
tree41d46fc399c231088a370e7b3e488a93342fa681
parente8f75c0270d930ef675fee22d74d1a3250e96962 (diff)
parentebb83d84e49b54369b0db67136a5fe1087124dcc (diff)
Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Scheduler SMP load-balancer improvements: - Avoid unnecessary migrations within SMT domains on hybrid systems. Problem: On hybrid CPU systems, (processors with a mixture of higher-frequency SMT cores and lower-frequency non-SMT cores), under the old code lower-priority CPUs pulled tasks from the higher-priority cores if more than one SMT sibling was busy - resulting in many unnecessary task migrations. Solution: The new code improves the load balancer to recognize SMT cores with more than one busy sibling and allows lower-priority CPUs to pull tasks, which avoids superfluous migrations and lets lower-priority cores inspect all SMT siblings for the busiest queue. - Implement the 'runnable boosting' feature in the EAS balancer: consider CPU contention in frequency, EAS max util & load-balance busiest CPU selection. This improves CPU utilization for certain workloads, while leaving other key workloads unchanged. Scheduler infrastructure improvements: - Rewrite the scheduler topology setup code by consolidating it into the build_sched_topology() helper function and building it dynamically on the fly. - Resolve the local_clock() vs. noinstr complications by rewriting the code: provide separate sched_clock_noinstr() and local_clock_noinstr() functions to be used in instrumentation code, and make sure it is all instrumentation-safe. Fixes: - Fix a kthread_park() race with wait_woken() - Fix misc wait_task_inactive() bugs unearthed by the -rt merge: - Fix UP PREEMPT bug by unifying the SMP and UP implementations - Fix task_struct::saved_state handling - Fix various rq clock update bugs, unearthed by turning on the rq clock debugging code. - Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger by creating enough cgroups, by removing the warning and restricting window size triggers to PSI file write-permission or CAP_SYS_RESOURCE. 
- Propagate SMT flags in the topology when removing degenerate domain - Fix grub_reclaim() calculation bug in the deadline scheduler code - Avoid resetting the min update period when it is unnecessary, in psi_trigger_destroy(). - Don't balance a task to its current running CPU in load_balance(), which was possible on certain NUMA topologies with overlapping groups. - Fix the sched-debug printing of rq->nr_uninterruptible Cleanups: - Address various -Wmissing-prototype warnings, as a preparation to (maybe) enable this warning in the future. - Remove unused code - Mark more functions __init - Fix shadow-variable warnings" * tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits) sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle() sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop() sched/core: Fixed missing rq clock update before calling set_rq_offline() sched/deadline: Update GRUB description in the documentation sched/deadline: Fix bandwidth reclaim equation in GRUB sched/wait: Fix a kthread_park race with wait_woken() sched/topology: Mark set_sched_topology() __init sched/fair: Rename variable cpu_util eff_util arm64/arch_timer: Fix MMIO byteswap sched/fair, cpufreq: Introduce 'runnable boosting' sched/fair: Refactor CPU utilization functions cpuidle: Use local_clock_noinstr() sched/clock: Provide local_clock_noinstr() x86/tsc: Provide sched_clock_noinstr() clocksource: hyper-v: Provide noinstr sched_clock() clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX x86/vdso: Fix gettimeofday masking math64: Always inline u128 version of mul_u64_u64_shr() s390/time: Provide sched_clock_noinstr() loongarch: Provide noinstr sched_clock_read() ...
-rw-r--r--Documentation/scheduler/sched-deadline.rst5
-rw-r--r--arch/arm64/include/asm/arch_timer.h8
-rw-r--r--arch/arm64/include/asm/io.h12
-rw-r--r--arch/loongarch/include/asm/loongarch.h2
-rw-r--r--arch/loongarch/kernel/time.c6
-rw-r--r--arch/s390/include/asm/timex.h13
-rw-r--r--arch/s390/kernel/time.c5
-rw-r--r--arch/x86/include/asm/mshyperv.h5
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h41
-rw-r--r--arch/x86/kernel/itmt.c23
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/smpboot.c96
-rw-r--r--arch/x86/kernel/tsc.c38
-rw-r--r--arch/x86/kvm/x86.c7
-rw-r--r--arch/x86/xen/time.c3
-rw-r--r--drivers/clocksource/arm_arch_timer.c54
-rw-r--r--drivers/clocksource/hyperv_timer.c42
-rw-r--r--drivers/cpuidle/cpuidle.c8
-rw-r--r--drivers/cpuidle/poll_state.c4
-rw-r--r--include/clocksource/hyperv_timer.h24
-rw-r--r--include/linux/kthread.h1
-rw-r--r--include/linux/math64.h2
-rw-r--r--include/linux/rbtree_latch.h2
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/sched/clock.h17
-rw-r--r--include/linux/sched/sd_flags.h5
-rw-r--r--include/linux/sched/topology.h2
-rw-r--r--include/linux/seqlock.h15
-rw-r--r--kernel/cgroup/cgroup.c12
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/clock.c19
-rw-r--r--kernel/sched/core.c278
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c57
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c329
-rw-r--r--kernel/sched/psi.c19
-rw-r--r--kernel/sched/sched.h105
-rw-r--r--kernel/sched/topology.c15
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/time/sched_clock.c24
-rw-r--r--kernel/time/timekeeping.c4
43 files changed, 773 insertions, 564 deletions
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 9d9be52f221a..9fe4846079bb 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -203,12 +203,15 @@ Deadline Task Scheduling
- Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
runqueue, including the tasks in Inactive state.
+ - Maximum usable bandwidth (max_bw): This is the maximum bandwidth usable by
+ deadline tasks and is currently set to the RT capacity.
+
The algorithm reclaims the bandwidth of the tasks in Inactive state.
It does so by decrementing the runtime of the executing task Ti at a pace equal
to
- dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
+ dq = -(max{ Ui, (Umax - Uinact - Uextra) } / Umax) dt
where:
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index af1fafbe7e1d..934c658ee947 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -88,13 +88,7 @@ static inline notrace u64 arch_timer_read_cntvct_el0(void)
#define arch_timer_reg_read_stable(reg) \
({ \
- u64 _val; \
- \
- preempt_disable_notrace(); \
- _val = erratum_handler(read_ ## reg)(); \
- preempt_enable_notrace(); \
- \
- _val; \
+ erratum_handler(read_ ## reg)(); \
})
/*
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 877495a0fd0c..51d92abf945e 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -22,13 +22,13 @@
* Generic IO read/write. These perform native-endian accesses.
*/
#define __raw_writeb __raw_writeb
-static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
+static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr)
{
asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr));
}
#define __raw_writew __raw_writew
-static inline void __raw_writew(u16 val, volatile void __iomem *addr)
+static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr)
{
asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr));
}
@@ -40,13 +40,13 @@ static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr)
}
#define __raw_writeq __raw_writeq
-static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
+static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
}
#define __raw_readb __raw_readb
-static inline u8 __raw_readb(const volatile void __iomem *addr)
+static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
{
u8 val;
asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
@@ -57,7 +57,7 @@ static inline u8 __raw_readb(const volatile void __iomem *addr)
}
#define __raw_readw __raw_readw
-static inline u16 __raw_readw(const volatile void __iomem *addr)
+static __always_inline u16 __raw_readw(const volatile void __iomem *addr)
{
u16 val;
@@ -80,7 +80,7 @@ static __always_inline u32 __raw_readl(const volatile void __iomem *addr)
}
#define __raw_readq __raw_readq
-static inline u64 __raw_readq(const volatile void __iomem *addr)
+static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
{
u64 val;
asm volatile(ALTERNATIVE("ldr %0, [%1]",
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 35e8a52fea11..1c2a0a2c8830 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -1167,7 +1167,7 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
#ifndef __ASSEMBLY__
-static inline u64 drdtime(void)
+static __always_inline u64 drdtime(void)
{
int rID = 0;
u64 val = 0;
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index f377e50f3c66..c189e03cd5da 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -190,9 +190,9 @@ static u64 read_const_counter(struct clocksource *clk)
return drdtime();
}
-static u64 native_sched_clock(void)
+static noinstr u64 sched_clock_read(void)
{
- return read_const_counter(NULL);
+ return drdtime();
}
static struct clocksource clocksource_const = {
@@ -211,7 +211,7 @@ int __init constant_clocksource_init(void)
res = clocksource_register_hz(&clocksource_const, freq);
- sched_clock_register(native_sched_clock, 64, freq);
+ sched_clock_register(sched_clock_read, 64, freq);
pr_info("Constant clock source device register\n");
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index ce878e85b6e4..4d646659a5f5 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -63,7 +63,7 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk)
return cc;
}
-static inline void store_tod_clock_ext(union tod_clock *tod)
+static __always_inline void store_tod_clock_ext(union tod_clock *tod)
{
asm volatile("stcke %0" : "=Q" (*tod) : : "cc");
}
@@ -177,7 +177,7 @@ static inline void local_tick_enable(unsigned long comp)
typedef unsigned long cycles_t;
-static inline unsigned long get_tod_clock(void)
+static __always_inline unsigned long get_tod_clock(void)
{
union tod_clock clk;
@@ -204,6 +204,11 @@ void init_cpu_timer(void);
extern union tod_clock tod_clock_base;
+static __always_inline unsigned long __get_tod_clock_monotonic(void)
+{
+ return get_tod_clock() - tod_clock_base.tod;
+}
+
/**
* get_clock_monotonic - returns current time in clock rate units
*
@@ -216,7 +221,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
unsigned long tod;
preempt_disable_notrace();
- tod = get_tod_clock() - tod_clock_base.tod;
+ tod = __get_tod_clock_monotonic();
preempt_enable_notrace();
return tod;
}
@@ -240,7 +245,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
* -> ns = (th * 125) + ((tl * 125) >> 9);
*
*/
-static inline unsigned long tod_to_ns(unsigned long todval)
+static __always_inline unsigned long tod_to_ns(unsigned long todval)
{
return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
}
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 6b7b6d5e3632..276278199c44 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -102,6 +102,11 @@ void __init time_early_init(void)
((long) qui.old_leap * 4096000000L);
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return tod_to_ns(__get_tod_clock_monotonic());
+}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 49bb4f2bd300..88d9ef98e087 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value);
u64 hv_get_non_nested_register(unsigned int reg);
void hv_set_non_nested_register(unsigned int reg, u64 value);
+static __always_inline u64 hv_raw_get_register(unsigned int reg)
+{
+ return __rdmsr(reg);
+}
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 4cf6794f9d68..c81858d903dc 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -231,14 +231,19 @@ static u64 vread_pvclock(void)
ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
- return ret;
+ return ret & S64_MAX;
}
#endif
#ifdef CONFIG_HYPERV_TIMER
static u64 vread_hvclock(void)
{
- return hv_read_tsc_page(&hvclock_page);
+ u64 tsc, time;
+
+ if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+ return time & S64_MAX;
+
+ return U64_MAX;
}
#endif
@@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
const struct vdso_data *vd)
{
if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
- return (u64)rdtsc_ordered();
+ return (u64)rdtsc_ordered() & S64_MAX;
/*
* For any memory-mapped vclock type, we need to make sure that gcc
* doesn't cleverly hoist a load before the mode check. Otherwise we
@@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
* which can be invalidated asynchronously and indicate invalidation by
* returning U64_MAX, which can be effectively tested by checking for a
* negative value after casting it to s64.
+ *
+ * This effectively forces a S64_MAX mask on the calculations, unlike the
+ * U64_MAX mask normally used by x86 clocksources.
*/
static inline bool arch_vdso_cycles_ok(u64 cycles)
{
@@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* @last. If not then use @last, which is the base time of the current
* conversion period.
*
- * This variant also removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
+ * This variant also uses a custom mask because while the clocksource mask of
+ * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
+ * U64_MAX as an exception value, additionally arch_vdso_cycles_ok() above
+ * declares everything with the MSB/Sign-bit set as invalid. Therefore the
+ * effective mask is S64_MAX.
*/
static __always_inline
u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
- if (cycles > last)
- return (cycles - last) * mult;
- return 0;
+ /*
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_ok() above), the effective mask is S64_MAX.
+ */
+ u64 delta = (cycles - last) & S64_MAX;
+
+ /*
+ * Due to the above mentioned TSC wobbles, filter out negative motion.
+ * Per the above masking, the effective sign bit is now bit 62.
+ */
+ if (unlikely(delta & (1ULL << 62)))
+ return 0;
+
+ return delta * mult;
}
#define vdso_calc_delta vdso_calc_delta
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 670eb08b972a..ee4fe8cdb857 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu)
/**
* sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio: Priority of cpu core
- * @core_cpu: The cpu number associated with the core
+ * @prio: Priority of @cpu
+ * @cpu: The CPU number
*
* The pstate driver will find out the max boost frequency
* and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
* frequency will receive higher priority.
*
* No need to rebuild sched domain after updating
* the CPU priorities. The sched domains have no
* dependency on CPU priorities.
*/
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
{
- int cpu, i = 1;
-
- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
- int smt_prio;
-
- /*
- * Ensure that the siblings are moved to the end
- * of the priority chain and only used when
- * all other high priority cpus are out of capacity.
- */
- smt_prio = prio * smp_num_siblings / (i * i);
- per_cpu(sched_core_priority, cpu) = smt_prio;
- i++;
- }
+ per_cpu(sched_core_priority, cpu) = prio;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0f35d44c56fe..fb8f52149be9 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now)
return -ENODEV;
}
-static noinstr u64 kvm_clock_read(void)
+static u64 kvm_clock_read(void)
{
u64 ret;
@@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
static noinstr u64 kvm_sched_clock_read(void)
{
- return kvm_clock_read() - kvm_sched_clock_offset;
+ return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
}
static inline void kvm_sched_clock_init(bool stable)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8779a7ed3e87..ed2d51960a7d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -602,7 +602,7 @@ static int x86_core_flags(void)
#ifdef CONFIG_SCHED_SMT
static int x86_smt_flags(void)
{
- return cpu_smt_flags() | x86_sched_itmt_flags();
+ return cpu_smt_flags();
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
@@ -613,50 +613,57 @@ static int x86_cluster_flags(void)
#endif
#endif
-static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
- { NULL, },
-};
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
-static struct sched_domain_topology_level x86_hybrid_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
+static struct sched_domain_topology_level x86_topology[6];
+
+static void __init build_sched_topology(void)
+{
+ int i = 0;
-static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+ };
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+ /*
+ * For now, skip the cluster domain on Hybrid.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+ };
+ }
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
+ };
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
+ /*
+ * When there is NUMA topology inside the package skip the DIE domain
+ * since the NUMA domains will auto-magically create the right spanning
+ * domains based on the SLIT.
+ */
+ if (!x86_has_numa_in_package) {
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_cpu_mask, SD_INIT_NAME(DIE)
+ };
+ }
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
+ /*
+ * There must be one trailing NULL entry left.
+ */
+ BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+
+ set_sched_topology(x86_topology);
+}
void set_cpu_sibling_map(int cpu)
{
@@ -1264,15 +1271,6 @@ void __init smp_prepare_cpus_common(void)
zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
}
- /*
- * Set 'default' x86 topology, this matches default_topology() in that
- * it has NUMA nodes as a topology level. See also
- * native_smp_cpus_done().
- *
- * Must be done before set_cpus_sibling_map() is ran.
- */
- set_sched_topology(x86_topology);
-
set_cpu_sibling_map(0);
}
@@ -1393,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done\n");
calculate_max_logical_packages();
-
- /* XXX for now assume numa-in-package and hybrid don't overlap */
- if (x86_has_numa_in_package)
- set_sched_topology(x86_numa_in_package_topology);
- if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- set_sched_topology(x86_hybrid_topology);
-
+ build_sched_topology();
nmi_selftest();
impress_friends();
cache_aps_init();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1412b771651e..3425c6a943e4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf)
}
early_param("tsc_early_khz", tsc_early_khz_setup);
-__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
{
int seq, idx;
- preempt_disable_notrace();
-
do {
seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
idx = seq & 1;
@@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+ preempt_disable_notrace();
+ __cyc2ns_read(data);
+}
+
__always_inline void cyc2ns_read_end(void)
{
preempt_enable_notrace();
@@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void)
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data data;
unsigned long long ns;
- cyc2ns_read_begin(&data);
+ __cyc2ns_read(&data);
ns = data.cyc2ns_offset;
ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- cyc2ns_read_end();
+ return ns;
+}
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = __cycles_2_ns(cyc);
+ preempt_enable_notrace();
return ns;
}
@@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void)
u64 tsc_now = rdtsc();
/* return the value in ns */
- return cycles_2_ns(tsc_now);
+ return __cycles_2_ns(tsc_now);
}
/*
@@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
-noinstr u64 sched_clock(void)
+noinstr u64 sched_clock_noinstr(void)
{
return paravirt_sched_clock();
}
@@ -260,11 +271,20 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
-u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
bool using_native_sched_clock(void) { return true; }
#endif
+notrace u64 sched_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+
int check_tsc_unstable(void)
{
return tsc_unstable;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04b57a336b34..bc68a39efd70 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2799,14 +2799,13 @@ static u64 read_tsc(void)
static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
int *mode)
{
- long v;
u64 tsc_pg_val;
+ long v;
switch (clock->vclock_mode) {
case VDSO_CLOCKMODE_HVCLOCK:
- tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
- tsc_timestamp);
- if (tsc_pg_val != U64_MAX) {
+ if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp, &tsc_pg_val)) {
/* TSC page valid */
*mode = VDSO_CLOCKMODE_HVCLOCK;
v = (tsc_pg_val - clock->cycle_last) &
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b74ac2562cfb..52fa5609b7f6 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void)
struct pvclock_vcpu_time_info *src;
u64 ret;
- preempt_disable_notrace();
src = &__this_cpu_read(xen_vcpu)->time;
ret = pvclock_clocksource_read_nowd(src);
ret -= xen_sched_clock_offset;
- preempt_enable_notrace();
+
return ret;
}
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index e09d4427f604..e733a2a1927a 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -191,22 +191,40 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
return val;
}
-static notrace u64 arch_counter_get_cntpct_stable(void)
+static noinstr u64 raw_counter_get_cntpct_stable(void)
{
return __arch_counter_get_cntpct_stable();
}
-static notrace u64 arch_counter_get_cntpct(void)
+static notrace u64 arch_counter_get_cntpct_stable(void)
+{
+ u64 val;
+ preempt_disable_notrace();
+ val = __arch_counter_get_cntpct_stable();
+ preempt_enable_notrace();
+ return val;
+}
+
+static noinstr u64 arch_counter_get_cntpct(void)
{
return __arch_counter_get_cntpct();
}
-static notrace u64 arch_counter_get_cntvct_stable(void)
+static noinstr u64 raw_counter_get_cntvct_stable(void)
{
return __arch_counter_get_cntvct_stable();
}
-static notrace u64 arch_counter_get_cntvct(void)
+static notrace u64 arch_counter_get_cntvct_stable(void)
+{
+ u64 val;
+ preempt_disable_notrace();
+ val = __arch_counter_get_cntvct_stable();
+ preempt_enable_notrace();
+ return val;
+}
+
+static noinstr u64 arch_counter_get_cntvct(void)
{
return __arch_counter_get_cntvct();
}
@@ -753,14 +771,14 @@ static int arch_timer_set_next_event_phys(unsigned long evt,
return 0;
}
-static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
+static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
{
u32 cnt_lo, cnt_hi, tmp_hi;
do {
- cnt_hi = readl_relaxed(t->base + offset_lo + 4);
- cnt_lo = readl_relaxed(t->base + offset_lo);
- tmp_hi = readl_relaxed(t->base + offset_lo + 4);
+ cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
+ cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo));
+ tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
} while (cnt_hi != tmp_hi);
return ((u64) cnt_hi << 32) | cnt_lo;
@@ -1060,7 +1078,7 @@ bool arch_timer_evtstrm_available(void)
return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available);
}
-static u64 arch_counter_get_cntvct_mem(void)
+static noinstr u64 arch_counter_get_cntvct_mem(void)
{
return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO);
}
@@ -1074,6 +1092,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
static void __init arch_counter_register(unsigned type)
{
+ u64 (*scr)(void);
u64 start_count;
int width;
@@ -1083,21 +1102,28 @@ static void __init arch_counter_register(unsigned type)
if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) {
- if (arch_timer_counter_has_wa())
+ if (arch_timer_counter_has_wa()) {
rd = arch_counter_get_cntvct_stable;
- else
+ scr = raw_counter_get_cntvct_stable;
+ } else {
rd = arch_counter_get_cntvct;
+ scr = arch_counter_get_cntvct;
+ }
} else {
- if (arch_timer_counter_has_wa())
+ if (arch_timer_counter_has_wa()) {
rd = arch_counter_get_cntpct_stable;
- else
+ scr = raw_counter_get_cntpct_stable;
+ } else {
rd = arch_counter_get_cntpct;
+ scr = arch_counter_get_cntpct;
+ }
}
arch_timer_read_counter = rd;
clocksource_counter.vdso_clock_mode = vdso_default;
} else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
+ scr = arch_counter_get_cntvct_mem;
}
width = arch_counter_get_width();
@@ -1113,7 +1139,7 @@ static void __init arch_counter_register(unsigned type)
timecounter_init(&arch_timer_kvm_info.timecounter,
&cyclecounter, start_count);
- sched_clock_register(arch_timer_read_counter, width, arch_timer_rate);
+ sched_clock_register(scr, width, arch_timer_rate);
}
static void arch_timer_stop(struct clock_event_device *clk)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 9fc008c16636..e56307a81f4d 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -365,6 +365,20 @@ void hv_stimer_global_cleanup(void)
}
EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
+static __always_inline u64 read_hv_clock_msr(void)
+{
+ /*
+ * Read the partition counter to get the current tick count. This count
+ * is set to 0 when the partition is created and is incremented in 100
+ * nanosecond units.
+ *
+ * Use hv_raw_get_register() because this function is used from
+ * noinstr. Notably, while HV_REGISTER_TIME_REF_COUNT is a synthetic
+ * register it doesn't need the GHCB path.
+ */
+ return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT);
+}
+
/*
* Code and definitions for the Hyper-V clocksources. Two
* clocksources are defined: one that reads the Hyper-V defined MSR, and
@@ -393,14 +407,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
-static u64 notrace read_hv_clock_tsc(void)
+static __always_inline u64 read_hv_clock_tsc(void)
{
- u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
+ u64 cur_tsc, time;
- if (current_tick == U64_MAX)
- current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+ /*
+ * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
+ * subsection Reference Counter, guarantees that the TSC and MSR
+ * times are in sync and monotonic. Therefore we can fall back
+ * to the MSR in case the TSC page indicates unavailability.
+ */
+ if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
+ time = read_hv_clock_msr();
- return current_tick;
+ return time;
}
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
@@ -408,7 +428,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
return read_hv_clock_tsc();
}
-static u64 notrace read_hv_sched_clock_tsc(void)
+static u64 noinstr read_hv_sched_clock_tsc(void)
{
return (read_hv_clock_tsc() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -460,16 +480,6 @@ static struct clocksource hyperv_cs_tsc = {
#endif
};
-static u64 notrace read_hv_clock_msr(void)
-{
- /*
- * Read the partition counter to get the current tick count. This count
- * is set to 0 when the partition is created and is incremented in
- * 100 nanosecond units.
- */
- return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
-}
-
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
return read_hv_clock_msr();
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8e929f6602ce..737a026ef58a 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -145,7 +145,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
instrumentation_begin();
- time_start = ns_to_ktime(local_clock());
+ time_start = ns_to_ktime(local_clock_noinstr());
tick_freeze();
/*
@@ -169,7 +169,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
tick_unfreeze();
start_critical_timings();
- time_end = ns_to_ktime(local_clock());
+ time_end = ns_to_ktime(local_clock_noinstr());
dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
dev->states_usage[index].s2idle_usage++;
@@ -243,7 +243,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
sched_idle_set_state(target_state);
trace_cpu_idle(index, dev->cpu);
- time_start = ns_to_ktime(local_clock());
+ time_start = ns_to_ktime(local_clock_noinstr());
stop_critical_timings();
if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
@@ -276,7 +276,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
start_critical_timings();
sched_clock_idle_wakeup_event();
- time_end = ns_to_ktime(local_clock());
+ time_end = ns_to_ktime(local_clock_noinstr());
trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
/* The cpu is no longer idle or about to enter idle. */
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index bdcfeaecd228..9b6d90a72601 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -15,7 +15,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
{
u64 time_start;
- time_start = local_clock();
+ time_start = local_clock_noinstr();
dev->poll_time_limit = false;
@@ -32,7 +32,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
continue;
loop_count = 0;
- if (local_clock() - time_start > limit) {
+ if (local_clock_noinstr() - time_start > limit) {
dev->poll_time_limit = true;
break;
}
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h
index 536f897375d0..6cdc873ac907 100644
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -38,8 +38,9 @@ extern void hv_remap_tsc_clocksource(void);
extern unsigned long hv_get_tsc_pfn(void);
extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-static inline notrace u64
-hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc, u64 *time)
{
u64 scale, offset;
u32 sequence;
@@ -63,7 +64,7 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
do {
sequence = READ_ONCE(tsc_pg->tsc_sequence);
if (!sequence)
- return U64_MAX;
+ return false;
/*
* Make sure we read sequence before we read other values from
* TSC page.
@@ -82,15 +83,8 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
- return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
-}
-
-static inline notrace u64
-hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
-{
- u64 cur_tsc;
-
- return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
+ *time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+ return true;
}
#else /* CONFIG_HYPERV_TIMER */
@@ -104,10 +98,10 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
return NULL;
}
-static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
- u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time)
{
- return U64_MAX;
+ return false;
}
static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 30e5bec81d2b..f1f95a71a4bc 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -89,6 +89,7 @@ int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void);
bool kthread_should_park(void);
bool __kthread_should_park(struct task_struct *k);
+bool kthread_should_stop_or_park(void);
bool kthread_freezable_should_stop(bool *was_frozen);
void *kthread_func(struct task_struct *k);
void *kthread_data(struct task_struct *k);
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 8b9191a2849e..bf74478926d4 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -168,7 +168,7 @@ static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
#endif /* mul_u64_u32_shr */
#ifndef mul_u64_u64_shr
-static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
{
return (u64)(((unsigned __int128)a * mul) >> shift);
}
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
index 3d1a9e716b80..6a0999c26c7c 100644
--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root,
do {
seq = raw_read_seqcount_latch(&root->seq);
node = __lt_find(key, root, seq & 1, ops->comp);
- } while (read_seqcount_latch_retry(&root->seq, seq));
+ } while (raw_read_seqcount_latch_retry(&root->seq, seq));
return node;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eed5d65b8d1f..1292d38d66cc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2006,15 +2006,12 @@ static __always_inline void scheduler_ipi(void)
*/
preempt_fold_need_resched();
}
-extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
#else
static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- return 1;
-}
#endif
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
+
/*
* Set thread flags in other task's structures.
* See asm/thread_info.h for TIF_xxxx flags available:
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index ca008f7d3615..196f0ca351a2 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -12,7 +12,16 @@
*
* Please use one of the three interfaces below.
*/
-extern unsigned long long notrace sched_clock(void);
+extern u64 sched_clock(void);
+
+#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
+extern u64 sched_clock_noinstr(void);
+#else
+static __always_inline u64 sched_clock_noinstr(void)
+{
+ return sched_clock();
+}
+#endif
/*
* See the comment in kernel/sched/clock.c
@@ -45,6 +54,11 @@ static inline u64 cpu_clock(int cpu)
return sched_clock();
}
+static __always_inline u64 local_clock_noinstr(void)
+{
+ return sched_clock_noinstr();
+}
+
static __always_inline u64 local_clock(void)
{
return sched_clock();
@@ -79,6 +93,7 @@ static inline u64 cpu_clock(int cpu)
return sched_clock_cpu(cpu);
}
+extern u64 local_clock_noinstr(void);
extern u64 local_clock(void);
#endif
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..fad77b5172e2 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
* Place busy tasks earlier in the domain
*
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- * up, but currently assumed to be set from the base domain
- * upwards (see update_top_cache_domain()).
* NEEDS_GROUPS: Load balancing flag.
*/
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
/*
* Prefer to place tasks in a sibling domain
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..67b573d5bf28 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -203,7 +203,7 @@ struct sched_domain_topology_level {
#endif
};
-extern void set_sched_topology(struct sched_domain_topology_level *tl);
+extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(type) .name = #type
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 3926e9027947..987a59d977c5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -671,9 +671,9 @@ typedef struct {
*
* Return: sequence counter raw value. Use the lowest bit as an index for
* picking which data copy to read. The full counter must then be checked
- * with read_seqcount_latch_retry().
+ * with raw_read_seqcount_latch_retry().
*/
-static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
+static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
/*
* Pairs with the first smp_wmb() in raw_write_seqcount_latch().
@@ -683,16 +683,17 @@ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
}
/**
- * read_seqcount_latch_retry() - end a seqcount_latch_t read section
+ * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
* @s: Pointer to seqcount_latch_t
* @start: count, from raw_read_seqcount_latch()
*
* Return: true if a read section retry is required, else false
*/
-static inline int
-read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
+static __always_inline int
+raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
- return read_seqcount_retry(&s->seqcount, start);
+ smp_rmb();
+ return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}
/**
@@ -752,7 +753,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
* entry = data_query(latch->data[idx], ...);
*
* // This includes needed smp_rmb()
- * } while (read_seqcount_latch_retry(&latch->seq, seq));
+ * } while (raw_read_seqcount_latch_retry(&latch->seq, seq));
*
* return entry;
* }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4d42f0cbc11e..8f917f682f52 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3891,6 +3891,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
+static int cgroup_pressure_open(struct kernfs_open_file *of)
+{
+ if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ return 0;
+}
+
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
@@ -5290,6 +5298,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5298,6 +5307,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5306,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5315,6 +5326,7 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .open = cgroup_pressure_open,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490792b1066e..07a057086d26 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -182,6 +182,16 @@ bool kthread_should_park(void)
}
EXPORT_SYMBOL_GPL(kthread_should_park);
+bool kthread_should_stop_or_park(void)
+{
+ struct kthread *kthread = __to_kthread(current);
+
+ if (!kthread)
+ return false;
+
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6a333adce3b3..357a4d18f638 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
seq = raw_read_seqcount_latch(&ls->latch);
idx = seq & 0x1;
val = ls->val[idx];
- } while (read_seqcount_latch_retry(&ls->latch, seq));
+ } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
return val;
}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b5cc2b53464d..5a575a0ba4e6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -293,22 +293,29 @@ again:
return clock;
}
-noinstr u64 local_clock(void)
+noinstr u64 local_clock_noinstr(void)
{
u64 clock;
if (static_branch_likely(&__sched_clock_stable))
- return sched_clock() + __sched_clock_offset;
+ return sched_clock_noinstr() + __sched_clock_offset;
if (!static_branch_likely(&sched_clock_running))
- return sched_clock();
+ return sched_clock_noinstr();
- preempt_disable_notrace();
clock = sched_clock_local(this_scd());
- preempt_enable_notrace();
return clock;
}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
EXPORT_SYMBOL_GPL(local_clock);
static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..7eb6e2927390 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+#ifdef CONFIG_PREEMPT_RT
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+#endif
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+#ifdef CONFIG_PREEMPT_RT
+ int match;
+
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state() and
+ * current_restore_rtlock_saved_state().
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ match = __task_state_match(p, state);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+#else
+ return __task_state_match(p, state);
+#endif
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
static void
@@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data)
goto out;
}
- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -3341,114 +3490,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero. When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count). If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_on_cpu()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
+ int match;
+
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
- if (READ_ONCE(p->__state) & state) {
- *success = 1;
- return true;
- }
+ *success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
/*
@@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* p::saved_state to TASK_RUNNING so any further tests will
* not result in false positives vs. @success
*/
- if (p->saved_state & state) {
+ if (match < 0)
p->saved_state = TASK_RUNNING;
- *success = 1;
- }
#endif
- return false;
+ return match > 0;
}
/*
@@ -9548,6 +9586,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -9689,7 +9728,6 @@ int sched_cpu_deactivate(unsigned int cpu)
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
- update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e3211455b203..4492608b7d7f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
+ unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5a9a4b81c972..e41a36bd66a6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
-}
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@@ -1260,43 +1253,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
}
/*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
@@ -2795,12 +2784,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0b2340a79b65..066ff1c8ae4e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..a80a73909dc2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1700,23 +1717,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -5577,6 +5577,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
/*
+ * Iterating over the list can trigger several calls to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
+ /*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
* fact that we pair with sched_free_group_rcu(), so that we cannot
@@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
+ rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
- * (@dst_cpu = -1) or migrated to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * the latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
- if (task_cpu(p) == cpu && dst_cpu != cpu)
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
@@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ if (boost)
+ util_est = max(util_est, runnable);
+
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
- else if (unlikely(task_on_rq_queued(p) || current == p))
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
@@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
return min(util, capacity_orig_of(cpu));
}
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
+}
+
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
+ p = NULL;
- return cpu_util_next(cpu, p, -1);
+ return cpu_util(cpu, p, -1, 0);
}
/*
@@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
* cpu_capacity.
*
* The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util_next()) and must be calculated
+ * energy cost is removed (by cpu_util()) and must be calculated
* separately (see eenv_task_busy_time). This ensures:
*
* - A stable PD utilization, no matter which CPU of that PD we want to place
@@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
int cpu;
for_each_cpu(cpu, pd_cpus) {
- unsigned long util = cpu_util_next(cpu, p, -1);
+ unsigned long util = cpu_util(cpu, p, -1, 0);
busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
@@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util;
/*
* Performance domain frequency: utilization clamping
@@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, eff_util);
}
return min(max_util, eenv->cpu_cap);
@@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
/*
@@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu: Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env: The load balancing environment
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
- * @sg: The candidate busiest group
+ * @group: The candidate busiest group
*
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
*
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly one busy CPU. Larger
+ * imbalances in the number of CPUs are dealt with in find_busiest_group().
*
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
*
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do asym_packing load balance. False
+ * otherwise.
*/
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
- struct sg_lb_stats *sgs,
- struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
{
-#ifdef CONFIG_SCHED_SMT
- bool local_is_smt, sg_is_smt;
- int sg_busy_cpus;
-
- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
- if (!local_is_smt) {
- /*
- * If we are here, @dst_cpu is idle and does not have SMT
- * siblings. Pull tasks if candidate group has two or more
- * busy CPUs.
- */
- if (sg_busy_cpus >= 2) /* implies sg_is_smt */
- return true;
-
- /*
- * @dst_cpu does not have SMT siblings. @sg may have SMT
- * siblings and only one is busy. In such case, @dst_cpu
- * can help if it has higher priority and is idle (i.e.,
- * it has no running tasks).
- */
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
- }
-
- /* @dst_cpu has SMT siblings. */
-
- if (sg_is_smt) {
- int local_busy_cpus = sds->local->group_weight -
- sds->local_stat.idle_cpus;
- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
- if (busy_cpus_delta == 1)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
+ /* Ensure that the whole local core is idle, if applicable. */
+ if (!sched_use_asym_prio(env->sd, env->dst_cpu))
return false;
- }
/*
- * @sg does not have SMT siblings. Ensure that @sds::local does not end
- * up with more than one busy SMT sibling and only pull tasks if there
- * are not busy CPUs (i.e., no CPU has running tasks).
+ * CPU priorities do not make sense for SMT cores with more than one
+ * busy sibling.
*/
- if (!sds->local_stat.sum_nr_running)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
- return false;
-#else
- /* Always return false so that callers deal with non-SMT cases. */
- return false;
-#endif
-}
-
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
- struct sched_group *group)
-{
- /* Only do SMT checks if either local or candidate have SMT siblings */
- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
- (group->flags & SD_SHARE_CPUCAPACITY))
- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+ if (group->flags & SD_SHARE_CPUCAPACITY) {
+ if (sgs->group_weight - sgs->idle_cpus != 1)
+ return false;
+ }
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
@@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
@@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -10129,8 +10180,13 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
@@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
@@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util_cfs(i);
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -10632,12 +10698,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core
+ * is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
@@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
@@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing between cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e072f6b31bf3..81fca77397f6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
-#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
- if (window_us < WINDOW_MIN_US ||
- window_us > WINDOW_MAX_US)
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
/*
@@ -1409,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_nr_triggers[t->state]--;
if (!group->rtpoll_nr_triggers[t->state])
group->rtpoll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->rtpoll_triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->rtpoll_min_period = period;
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
/* Destroy rtpoll_task when the last trigger is destroyed */
if (group->rtpoll_states == 0) {
group->rtpoll_until = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7b3e0a2b20..50d4b61aef3a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -286,12 +286,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -754,6 +748,12 @@ struct dl_rq {
u64 extra_bw;
/*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
+ */
+ u64 max_bw;
+
+ /*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
@@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple updates.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-/**
- * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
- * @cpu: the CPU to get the utilization for.
- *
- * The unit of the return value must be the same as the one of CPU capacity
- * so that CPU utilization can be compared with CPU capacity.
- *
- * CPU utilization is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on that CPU.
- * It represents the amount of CPU capacity currently used by CFS tasks in
- * the range [0..max CPU capacity] with max CPU capacity being the CPU
- * capacity at f_max.
- *
- * The estimated CPU utilization is defined as the maximum between CPU
- * utilization and sum of the estimated utilization of the currently
- * runnable tasks on that CPU. It preserves a utilization "snapshot" of
- * previously-executed tasks, which helps better deduce how busy a CPU will
- * be when a long-sleeping task wakes up. The contribution to CPU utilization
- * of such a task would be significantly decayed at this point of time.
- *
- * CPU utilization can be higher than the current CPU capacity
- * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
- * of rounding errors as well as task migrations or wakeups of new tasks.
- * CPU utilization has to be capped to fit into the [0..max CPU capacity]
- * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
- * could be seen as over-utilized even though CPU1 has 20% of spare CPU
- * capacity. CPU utilization is allowed to overshoot current CPU capacity
- * though since this is useful for predicting the CPU capacity required
- * after task migrations (scheduler-driven DVFS).
- *
- * Return: (Estimated) utilization for the specified CPU.
- */
-static inline unsigned long cpu_util_cfs(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned long util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- if (sched_feat(UTIL_EST)) {
- util = max_t(unsigned long, util,
- READ_ONCE(cfs_rq->avg.util_est.enqueued));
- }
- return min(util, capacity_orig_of(cpu));
-}
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
static inline unsigned long cpu_util_rt(struct rq *rq)
{
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6682535e37c8..d3a3b2646ec4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_rq_lock_irqsave(rq, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
- if (parent->parent)
+
+ if (parent->parent) {
parent->parent->child = tmp;
+ if (tmp->flags & SD_SHARE_CPUCAPACITY)
+ parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+ }
+
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b74730738..48c53e4739ea 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
- if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 8464c5acc913..68d6c1190ac7 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = {
.actual_read_sched_clock = jiffy_sched_clock_read,
};
-static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
@@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
notrace int sched_clock_read_retry(unsigned int seq)
{
- return read_seqcount_latch_retry(&cd.seq, seq);
+ return raw_read_seqcount_latch_retry(&cd.seq, seq);
}
-unsigned long long notrace sched_clock(void)
+unsigned long long noinstr sched_clock_noinstr(void)
{
- u64 cyc, res;
- unsigned int seq;
struct clock_read_data *rd;
+ unsigned int seq;
+ u64 cyc, res;
do {
- rd = sched_clock_read_begin(&seq);
+ seq = raw_read_seqcount_latch(&cd.seq);
+ rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
- } while (sched_clock_read_retry(seq));
+ } while (raw_read_seqcount_latch_retry(&cd.seq, seq));
return res;
}
+unsigned long long notrace sched_clock(void)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return ns;
+}
+
/*
* Updating the data required to read the clock.
*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 09d594900ee0..266d02809dbb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += fast_tk_get_delta_ns(tkr);
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
@@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
delta = fast_tk_get_delta_ns(tkr);
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
*mono = basem + delta;