summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/watchdog.c29
-rw-r--r--kernel/watchdog_buddy.c93
3 files changed, 116 insertions, 7 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 7eb72033143c..f9e3fd9195d9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e67125f64719..10947c835079 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -85,7 +85,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)
static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
@@ -106,6 +106,14 @@ notrace void arch_touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+void watchdog_hardlockup_touch_cpu(unsigned int cpu)
+{
+ per_cpu(watchdog_hardlockup_touched, cpu) = true;
+
+ /* Match with smp_rmb() in watchdog_hardlockup_check() */
+ smp_wmb();
+}
+
static bool is_hardlockup(unsigned int cpu)
{
int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
@@ -123,13 +131,16 @@ static bool is_hardlockup(unsigned int cpu)
return false;
}
-static void watchdog_hardlockup_kick(void)
+static unsigned long watchdog_hardlockup_kick(void)
{
- atomic_inc(raw_cpu_ptr(&hrtimer_interrupts));
+ return atomic_inc_return(raw_cpu_ptr(&hrtimer_interrupts));
}
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
+ /* Match with smp_wmb() in watchdog_hardlockup_touch_cpu() */
+ smp_rmb();
+
if (per_cpu(watchdog_hardlockup_touched, cpu)) {
per_cpu(watchdog_hardlockup_touched, cpu) = false;
return;
@@ -180,11 +191,11 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
}
}
-#else /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
+#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
-static inline void watchdog_hardlockup_kick(void) { }
+static inline unsigned long watchdog_hardlockup_kick(void) { return 0; }
-#endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */
+#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
/*
* These functions can be overridden based on the configured hardlockdup detector.
@@ -443,11 +454,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ unsigned long hrtimer_interrupts;
if (!watchdog_enabled)
return HRTIMER_NORESTART;
- watchdog_hardlockup_kick();
+ hrtimer_interrupts = watchdog_hardlockup_kick();
+
+ /* test for hardlockups */
+ watchdog_buddy_check_hardlockup(hrtimer_interrupts);
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
new file mode 100644
index 000000000000..fee45af2e5bd
--- /dev/null
+++ b/kernel/watchdog_buddy.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/nmi.h>
+#include <linux/percpu-defs.h>
+
+static cpumask_t __read_mostly watchdog_cpus;
+
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+int __init watchdog_hardlockup_probe(void)
+{
+ return 0;
+}
+
+void watchdog_hardlockup_enable(unsigned int cpu)
+{
+ unsigned int next_cpu;
+
+ /*
+ * The new CPU will be marked online before the hrtimer interrupt
+ * gets a chance to run on it. If another CPU tests for a
+ * hardlockup on the new CPU before it has run its hrtimer
+ * interrupt, it will get a false positive. Touch the watchdog on
+ * the new CPU to delay the check for at least 3 sampling periods
+ * to guarantee one hrtimer has run on the new CPU.
+ */
+ watchdog_hardlockup_touch_cpu(cpu);
+
+ /*
+ * We are going to check the next CPU. Our watchdog_hrtimer
+ * need not be zero if the CPU has already been online earlier.
+ * Touch the watchdog on the next CPU to avoid a false positive
+ * if we try to check it in fewer than 3 interrupts.
+ */
+ next_cpu = watchdog_next_cpu(cpu);
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_hardlockup_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this CPU will cause the CPU before this one to start
+ * checking the one after this one. If this CPU just finished checking
+ * the next CPU and updating hrtimer_interrupts_saved, and then the
+ * previous CPU checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next CPU to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ watchdog_hardlockup_touch_cpu(next_cpu);
+
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+void watchdog_buddy_check_hardlockup(unsigned long hrtimer_interrupts)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples get us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (hrtimer_interrupts % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next CPU */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ watchdog_hardlockup_check(next_cpu, NULL);
+}