diff options
87 files changed, 2661 insertions, 745 deletions
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index cce768afec6b..b70b1d8bd8e6 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -138,11 +138,10 @@ associated with the source address of the indirect branch. Specifically, the BHB might be shared across privilege levels even in the presence of Enhanced IBRS. -Currently the only known real-world BHB attack vector is via -unprivileged eBPF. Therefore, it's highly recommended to not enable -unprivileged eBPF, especially when eIBRS is used (without retpolines). -For a full mitigation against BHB attacks, it's recommended to use -retpolines (or eIBRS combined with retpolines). +Previously the only known real-world BHB attack vector was via unprivileged +eBPF. Further research has found attacks that don't require unprivileged eBPF. +For a full mitigation against BHB attacks it is recommended to set BHI_DIS_S or +use the BHB clearing sequence. Attack scenarios ---------------- @@ -430,6 +429,23 @@ The possible values in this file are: 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB =========================== ======================================================= + - Branch History Injection (BHI) protection status: + +.. list-table:: + + * - BHI: Not affected + - System is not affected + * - BHI: Retpoline + - System is protected by retpoline + * - BHI: BHI_DIS_S + - System is protected by BHI_DIS_S + * - BHI: SW loop; KVM SW loop + - System is protected by software clearing sequence + * - BHI: Syscall hardening + - Syscalls are hardened against BHI + * - BHI: Syscall hardening; KVM: SW loop + - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence + Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. @@ -484,7 +500,11 @@ Spectre variant 2 Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at boot, by setting the IBRS bit, and they're automatically protected against - Spectre v2 variant attacks. + some Spectre v2 variant attacks. The BHB can still influence the choice of + indirect branch predictor entry, and although branch predictor entries are + isolated between modes when eIBRS is enabled, the BHB itself is not isolated + between modes. Systems which support BHI_DIS_S will set it to protect against + BHI attacks. On Intel's enhanced IBRS systems, this includes cross-thread branch target injections on SMT systems (STIBP). In other words, Intel eIBRS enables @@ -638,6 +658,22 @@ kernel command line. spectre_v2=off. Spectre variant 1 mitigations cannot be disabled. + spectre_bhi= + + [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + regardless of this setting. This setting affects the deployment + of the HW BHI control and the SW BHB clearing sequence. + + on + unconditionally enable. + off + unconditionally disable. + auto + enable if hardware mitigation + control(BHI_DIS_S) is available, otherwise + enable alternate mitigation in KVM. + For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt Mitigation selection guide diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 623fce7d5fcd..70046a019d42 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6063,6 +6063,18 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst + spectre_bhi= [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + reglardless of this setting. This setting affects the + deployment of the HW BHI control and the SW BHB + clearing sequence. + + on - unconditionally enable. + off - unconditionally disable. + auto - (default) enable hardware mitigation + (BHI_DIS_S) if available, otherwise enable + alternate mitigation in KVM. + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst new file mode 100644 index 000000000000..e2bd61ccd96f --- /dev/null +++ b/Documentation/filesystems/bcachefs/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +bcachefs Documentation +====================== + +.. toctree:: + :maxdepth: 2 + :numbered: + + errorcodes diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 0ea1e44fa028..1f9b4c905a6a 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -69,6 +69,7 @@ Documentation for filesystem implementations. afs autofs autofs-mount-control + bcachefs/index befs bfs btrfs diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a5..c64203a9f50e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3573,6 +3573,7 @@ S: Supported C: irc://irc.oftc.net/bcache T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ +F: Documentation/filesystems/bcachefs/ BDISP ST MEDIA DRIVER M: Fabien Dessenne <fabien.dessenne@foss.st.com> @@ -22577,6 +22578,7 @@ Q: https://patchwork.kernel.org/project/linux-pm/list/ B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat F: tools/power/x86/turbostat/ +F: tools/testing/selftests/turbostat/ TW5864 VIDEO4LINUX DRIVER M: Bluecherry Maintainers <maintainers@bluecherrydvr.com> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fff6ed46e90..10a6251f58f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,6 +2633,32 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst> +choice + prompt "Clear branch history" + depends on CPU_SUP_INTEL + default SPECTRE_BHI_ON + help + Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks + where the branch history buffer is poisoned to speculatively steer + indirect branches. + See <file:Documentation/admin-guide/hw-vuln/spectre.rst> + +config SPECTRE_BHI_ON + bool "on" + help + Equivalent to setting spectre_bhi=on command line parameter. +config SPECTRE_BHI_OFF + bool "off" + help + Equivalent to setting spectre_bhi=off command line parameter. +config SPECTRE_BHI_AUTO + bool "auto" + depends on BROKEN + help + Equivalent to setting spectre_bhi=auto command line parameter. + +endchoice + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf3..6de50b80702e 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } @@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in @@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void) * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8af2a26b24f6..1b5be07f8669 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. + */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index eabf48c4d4b4..c779046cc3fe 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -92,6 +92,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY /* * SYSENTER doesn't filter flags, so we need to clear NT and AC @@ -206,6 +207,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -276,3 +278,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..c2235bae17ef 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -18,8 +18,25 @@ #include <asm/syscalls_32.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 #define __SYSCALL(nr, sym) __ia32_##sym, - -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); + +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_32.h> + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..33b3f09e6f15 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -11,8 +11,23 @@ #include <asm/syscalls_64.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ #define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); + +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_64.h> + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265..03de4a932131 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -11,8 +11,12 @@ #include <asm/syscalls_x32.h> #undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include <asm/syscalls_x32.h> +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a38f8f9ba657..3c7434329661 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -461,11 +461,15 @@ /* * Extended auxiliary flags: Linux defined - for features scattered in various - * CPUID levels like 0x80000022, etc. + * CPUID levels like 0x80000022, etc and Linux defined features. * * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ /* * BUG word(s) @@ -515,4 +519,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 05956bd8bacf..e72c2b872957 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -163,6 +166,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 170c89ed22fc..ff5f1ecc7d1e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -326,6 +326,19 @@ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF .endm +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT +.endm +#else +#define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT +#endif + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -368,6 +381,10 @@ extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + extern void (*x86_return_thunk)(void); extern void __warn_thunk(void); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f44e2f9ab65d..2fc7bc3863ff 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. @@ -127,6 +125,7 @@ static inline int syscall_get_arch(struct task_struct *task) } bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ba936d798b..295463707e68 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1607,6 +1607,79 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ dump_stack(); } +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. + */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_ON, + BHI_MITIGATION_AUTO, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : + IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : + BHI_MITIGATION_AUTO; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else if (!strcmp(str, "auto")) + bhi_mitigation = BHI_MITIGATION_AUTO; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) + return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return; + + if (spec_ctrl_bhi_dis()) + return; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Mitigate KVM by default */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + return; + + /* Mitigate syscalls when the mitigation is forced =on */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1718,6 +1791,9 @@ static void __init spectre_v2_select_mitigation(void) mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); + if (boot_cpu_has(X86_BUG_BHI)) + bhi_select_mitigation(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -2695,15 +2771,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2712,10 +2788,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2725,14 +2801,31 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } +static const char * const spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Syscall hardening, KVM: SW loop"; + + return "; BHI: Vulnerable (Syscall hardening enabled)"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2745,13 +2838,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5c1e6d6be267..754d91857d63 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1120,6 +1120,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1182,18 +1183,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; @@ -1435,6 +1436,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(ia32_cap)) setup_force_cpu_bug(X86_BUG_RFDS); + /* When virtualized, eIBRS could be hidden, assume vulnerable */ + if (!(ia32_cap & ARCH_CAP_BHI_NO) && + !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index a515328d9d7d..af5aa2c754c2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 58ac8d69c94b..2f4e155080ba 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -52,7 +52,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) #define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) -#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) /* CPUID level 0x80000007 (EDX). */ @@ -128,6 +128,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); default: return x86_feature; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d061..f6986dee6f8c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY_VMEXIT + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..984ea2089efc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1621,7 +1621,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) static u64 kvm_get_arch_capabilities(void) { diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c index 7bcbc4895ec2..271bfa038f5b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c @@ -25,6 +25,7 @@ #include <subdev/bios.h> #include <subdev/bios/init.h> +#include <subdev/gsp.h> void gm107_devinit_disable(struct nvkm_devinit *init) @@ -33,10 +34,13 @@ gm107_devinit_disable(struct nvkm_devinit *init) u32 r021c00 = nvkm_rd32(device, 0x021c00); u32 r021c04 = nvkm_rd32(device, 0x021c04); - if (r021c00 & 0x00000001) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); - if (r021c00 & 0x00000004) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + /* gsp only wants to enable/disable display */ + if (!nvkm_gsp_rm(device->gsp)) { + if (r021c00 & 0x00000001) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); + if (r021c00 & 0x00000004) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + } if (r021c04 & 0x00000001) nvkm_subdev_disable(device, NVKM_ENGINE_DISP, 0); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c index 11b4c9c274a1..666eb93b1742 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c @@ -41,6 +41,7 @@ r535_devinit_new(const struct nvkm_devinit_func *hw, rm->dtor = r535_devinit_dtor; rm->post = hw->post; + rm->disable = hw->disable; ret = nv50_devinit_new_(rm, device, type, inst, pdevinit); if (ret) diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c index 4c34344dc7dc..d7027d600208 100644 --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c @@ -50,12 +50,12 @@ static void mtk_vcodec_vpu_reset_dec_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static void mtk_vcodec_vpu_reset_enc_handler(void *priv) @@ -65,12 +65,12 @@ static void mtk_vcodec_vpu_reset_enc_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static const struct mtk_vcodec_fw_ops mtk_vcodec_vpu_msg = { diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c index f47c98faf068..2073781ccadb 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c @@ -268,7 +268,9 @@ static int fops_vcodec_open(struct file *file) ctx->dev->vdec_pdata->init_vdec_params(ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mtk_vcodec_dbgfs_create(ctx); mutex_unlock(&dev->dev_mutex); @@ -311,7 +313,9 @@ static int fops_vcodec_release(struct file *file) v4l2_ctrl_handler_free(&ctx->ctrl_hdl); mtk_vcodec_dbgfs_remove(dev, ctx->id); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -404,6 +408,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) for (i = 0; i < MTK_VDEC_HW_MAX; i++) mutex_init(&dev->dec_mutex[i]); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h index 849b89dd205c..85b2c0d3d8bc 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h @@ -241,6 +241,7 @@ struct mtk_vcodec_dec_ctx { * * @dec_mutex: decoder hardware lock * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @decode_workqueue: decode work queue * * @irqlock: protect data access by irq handler and work thread @@ -282,6 +283,7 @@ struct mtk_vcodec_dec_dev { /* decoder hardware mutex lock */ struct mutex dec_mutex[MTK_VDEC_HW_MAX]; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *decode_workqueue; spinlock_t irqlock; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c index 06ed47df693b..21836dd6ef85 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c @@ -869,7 +869,6 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) inst->vpu.codec_type = ctx->current_codec; inst->vpu.capture_type = ctx->capture_fourcc; - ctx->drv_handle = inst; err = vpu_dec_init(&inst->vpu); if (err) { mtk_vdec_err(ctx, "vdec_hevc init err=%d", err); @@ -898,6 +897,7 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) mtk_vdec_debug(ctx, "lat hevc instance >> %p, codec_type = 0x%x", inst, inst->vpu.codec_type); + ctx->drv_handle = inst; return 0; error_free_inst: kfree(inst); diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c index 19407f9bc773..987b3d71b662 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c @@ -449,7 +449,7 @@ static int vdec_vp8_decode(void *h_vdec, struct mtk_vcodec_mem *bs, inst->frm_cnt, y_fb_dma, c_fb_dma, fb); inst->cur_fb = fb; - dec->bs_dma = (unsigned long)bs->dma_addr; + dec->bs_dma = (uint64_t)bs->dma_addr; dec->bs_sz = bs->size; dec->cur_y_fb_dma = y_fb_dma; dec->cur_c_fb_dma = c_fb_dma; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c index 55355fa70090..039082f600c8 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c @@ -16,6 +16,7 @@ #include "../vdec_drv_base.h" #include "../vdec_vpu_if.h" +#define VP9_MAX_SUPER_FRAMES_NUM 8 #define VP9_SUPER_FRAME_BS_SZ 64 #define MAX_VP9_DPB_SIZE 9 @@ -133,11 +134,11 @@ struct vp9_sf_ref_fb { */ struct vdec_vp9_vsi { unsigned char sf_bs_buf[VP9_SUPER_FRAME_BS_SZ]; - struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_FRM_BUF_NUM-1]; + struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_SUPER_FRAMES_NUM]; int sf_next_ref_fb_idx; unsigned int sf_frm_cnt; - unsigned int sf_frm_offset[VP9_MAX_FRM_BUF_NUM-1]; - unsigned int sf_frm_sz[VP9_MAX_FRM_BUF_NUM-1]; + unsigned int sf_frm_offset[VP9_MAX_SUPER_FRAMES_NUM]; + unsigned int sf_frm_sz[VP9_MAX_SUPER_FRAMES_NUM]; unsigned int sf_frm_idx; unsigned int sf_init; struct vdec_fb fb; @@ -526,7 +527,7 @@ static void vp9_swap_frm_bufs(struct vdec_vp9_inst *inst) /* if this super frame and it is not last sub-frame, get next fb for * sub-frame decode */ - if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt - 1) + if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt) vsi->sf_next_ref_fb_idx = vp9_get_sf_ref_fb(inst); } @@ -735,7 +736,7 @@ static void get_free_fb(struct vdec_vp9_inst *inst, struct vdec_fb **out_fb) static int validate_vsi_array_indexes(struct vdec_vp9_inst *inst, struct vdec_vp9_vsi *vsi) { - if (vsi->sf_frm_idx >= VP9_MAX_FRM_BUF_NUM - 1) { + if (vsi->sf_frm_idx > VP9_MAX_SUPER_FRAMES_NUM) { mtk_vdec_err(inst->ctx, "Invalid vsi->sf_frm_idx=%u.", vsi->sf_frm_idx); return -EIO; } diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index cf48d09b78d7..eea709d93820 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1074,7 +1074,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst unsigned int mi_row; unsigned int mi_col; unsigned int offset; - unsigned int pa; + dma_addr_t pa; unsigned int size; struct vdec_vp9_slice_tiles *tiles; unsigned char *pos; @@ -1109,7 +1109,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst pos = va + offset; end = va + bs->size; /* truncated */ - pa = (unsigned int)bs->dma_addr + offset; + pa = bs->dma_addr + offset; tb = instance->tile.va; for (i = 0; i < rows; i++) { for (j = 0; j < cols; j++) { diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c index 82e57ae983d5..da6be556727b 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c @@ -77,12 +77,14 @@ static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde struct mtk_vcodec_dec_ctx *ctx; int ret = false; + mutex_lock(&dec_dev->dev_ctx_lock); list_for_each_entry(ctx, &dec_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&dec_dev->dev_ctx_lock); return ret; } diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c index 6319f24bc714..3cb8a1622222 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c @@ -177,7 +177,9 @@ static int fops_vcodec_open(struct file *file) mtk_v4l2_venc_dbg(2, ctx, "Create instance [%d]@%p m2m_ctx=%p ", ctx->id, ctx, ctx->m2m_ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mutex_unlock(&dev->dev_mutex); mtk_v4l2_venc_dbg(0, ctx, "%s encoder [%d]", dev_name(&dev->plat_dev->dev), @@ -212,7 +214,9 @@ static int fops_vcodec_release(struct file *file) v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -294,6 +298,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) mutex_init(&dev->enc_mutex); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h index a042f607ed8d..0bd85d0fb379 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h @@ -178,6 +178,7 @@ struct mtk_vcodec_enc_ctx { * * @enc_mutex: encoder hardware lock. * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @encode_workqueue: encode work queue * * @enc_irq: h264 encoder irq resource @@ -205,6 +206,7 @@ struct mtk_vcodec_enc_dev { /* encoder hardware mutex lock */ struct mutex enc_mutex; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *encode_workqueue; int enc_irq; diff --git a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c index 84ad1cc6ad17..51bb7ee141b9 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c @@ -47,12 +47,14 @@ static bool vpu_enc_check_ap_inst(struct mtk_vcodec_enc_dev *enc_dev, struct ven struct mtk_vcodec_enc_ctx *ctx; int ret = false; + mutex_lock(&enc_dev->dev_ctx_lock); list_for_each_entry(ctx, &enc_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&enc_dev->dev_ctx_lock); return ret; } diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 8ea867c2a01a..62bc24f6dcc7 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -263,12 +263,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) if (!ec_dev) return -ENOMEM; - ret = devm_serdev_device_open(dev, serdev); - if (ret) { - dev_err(dev, "Unable to open UART device"); - return ret; - } - serdev_device_set_drvdata(serdev, ec_dev); init_waitqueue_head(&ec_uart->response.wait_queue); @@ -280,14 +274,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) return ret; } - ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); - if (ret < 0) { - dev_err(dev, "Failed to set up host baud rate (%d)", ret); - return ret; - } - - serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); - /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); ec_dev->dev = dev; @@ -301,6 +287,20 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); + ret = devm_serdev_device_open(dev, serdev); + if (ret) { + dev_err(dev, "Unable to open UART device"); + return ret; + } + + ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); + if (ret < 0) { + dev_err(dev, "Failed to set up host baud rate (%d)", ret); + return ret; + } + + serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); + return cros_ec_register(ec_dev); } diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index ee2e164f86b9..38c932df6446 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -598,6 +598,15 @@ static const struct dmi_system_id acer_quirks[] __initconst = { .driver_data = &quirk_acer_predator_v4, }, { + .callback = dmi_matched, + .ident = "Acer Predator PH18-71", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Predator PH18-71"), + }, + .driver_data = &quirk_acer_predator_v4, + }, + { .callback = set_force_caps, .ident = "Acer Aspire Switch 10E SW3-016", .matches = { diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 7457ca2b27a6..c7a827645864 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -49,6 +49,8 @@ static const struct acpi_device_id intel_hid_ids[] = { {"INTC1076", 0}, {"INTC1077", 0}, {"INTC1078", 0}, + {"INTC107B", 0}, + {"INTC10CB", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); @@ -504,6 +506,7 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) struct platform_device *device = context; struct intel_hid_priv *priv = dev_get_drvdata(&device->dev); unsigned long long ev_index; + struct key_entry *ke; int err; /* @@ -545,11 +548,15 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) if (event == 0xc0 || !priv->array) return; - if (!sparse_keymap_entry_from_scancode(priv->array, event)) { + ke = sparse_keymap_entry_from_scancode(priv->array, event); + if (!ke) { dev_info(&device->dev, "unknown event 0x%x\n", event); return; } + if (ke->type == KE_IGNORE) + return; + wakeup: pm_wakeup_hard_event(&device->dev); diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 084c355c86f5..79bb2c801daa 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -136,8 +136,6 @@ static int intel_vbtn_input_setup(struct platform_device *device) priv->switches_dev->id.bustype = BUS_HOST; if (priv->has_switches) { - detect_tablet_mode(&device->dev); - ret = input_register_device(priv->switches_dev); if (ret) return ret; @@ -258,9 +256,6 @@ static const struct dmi_system_id dmi_switches_allow_list[] = { static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) { - unsigned long long vgbs; - acpi_status status; - /* See dual_accel_detect.h for more info */ if (dual_accel) return false; @@ -268,8 +263,7 @@ static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) if (!dmi_check_system(dmi_switches_allow_list)) return false; - status = acpi_evaluate_integer(handle, "VGBS", NULL, &vgbs); - return ACPI_SUCCESS(status); + return acpi_has_method(handle, "VGBS"); } static int intel_vbtn_probe(struct platform_device *device) @@ -316,6 +310,9 @@ static int intel_vbtn_probe(struct platform_device *device) if (ACPI_FAILURE(status)) dev_err(&device->dev, "Error VBDL failed with ACPI status %d\n", status); } + // Check switches after buttons since VBDL may have side effects. + if (has_switches) + detect_tablet_mode(&device->dev); device_init_wakeup(&device->dev, true); /* diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index ad3c39e9e9f5..e714ee6298dd 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -736,7 +736,7 @@ static int acpi_add(struct acpi_device *device) default: year = 2019; } - pr_info("product: %s year: %d\n", product, year); + pr_info("product: %s year: %d\n", product ?: "unknown", year); if (year >= 2019) battery_limit_use_wmbb = 1; diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 291f14ef6702..77244c9aa60d 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -264,6 +264,7 @@ static const struct key_entry toshiba_acpi_keymap[] = { { KE_KEY, 0xb32, { KEY_NEXTSONG } }, { KE_KEY, 0xb33, { KEY_PLAYPAUSE } }, { KE_KEY, 0xb5a, { KEY_MEDIA } }, + { KE_IGNORE, 0x0e00, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1430, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1501, { KEY_RESERVED } }, /* Output changed */ { KE_IGNORE, 0x1502, { KEY_RESERVED } }, /* HDMI plugged/unplugged */ @@ -3523,9 +3524,10 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) (dev->kbd_mode == SCI_KBD_MODE_ON) ? LED_FULL : LED_OFF); break; + case 0x8e: /* Power button pressed */ + break; case 0x85: /* Unknown */ case 0x8d: /* Unknown */ - case 0x8e: /* Unknown */ case 0x94: /* Unknown */ case 0x95: /* Unknown */ default: diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3640f417cce1..5c180fdc3efb 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -290,28 +289,27 @@ retry: ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); - if (ret) { - if (!bch2_err_matches(ret, ENOENT)) - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); - if (ret) { - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; - if (!IS_ERR(acl)) + if (ret) + acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; + + if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); -out: - if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) - goto retry; bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 63102992d955..364ae42022af 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1535,6 +1535,20 @@ enum btree_id { BTREE_ID_NR }; +static inline bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + #define BTREE_MAX_DEPTH 4U /* Btree nodes */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6280da1244b5..d2555da55c6d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -368,11 +368,16 @@ again: buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: - bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; + + if (!btree_id_is_alloc(b->c.btree_id)) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; + } continue; } @@ -544,12 +549,12 @@ reconstruct_root: bch2_btree_root_alloc_fake(c, i, 0); } else { bch2_btree_root_alloc_fake(c, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) break; } - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); reconstructed_root = true; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24772538e4cc..1d58d447b386 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 5cbcbfe85235..1e8cf49a6935 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); } +static void journal_iter_verify(struct journal_iter *iter) +{ + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + BUG_ON(cmp < 0); + } +} + static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; /* * If an iterator points one after the key we just inserted, decrement @@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c) * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will * handle that: */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } } static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) @@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (idx > keys->gap) idx -= keys->size - keys->nr; + size_t old_gap = keys->gap; + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, @@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, keys->gap = keys->nr; } - journal_iters_move_gap(c, keys->gap, idx); + journal_iters_move_gap(c, old_gap, idx); move_gap(keys, idx); @@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->data + iter->idx; + journal_iter_verify(iter); + + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + if (cmp > 0) + break; + BUG_ON(cmp); - while (k < iter->keys->data + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->level = level; iter->keys = &c->journal_keys; iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -434,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } } /* @@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - if (trans->journal_replay_not_finished && - !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 581edcb0911b..88a3582a3275 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, } else { mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_pcpu); + bc->nr_freed_pcpu++; mutex_unlock(&bc->lock); } } @@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_pcpu)) { ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_pcpu--; } mutex_unlock(&bc->lock); } @@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - j->watermark == BCH_WATERMARK_stripe) + !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b9b151e693ed..f2caf491957e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - struct btree_path *linked; - unsigned i, iter; - int ret; - - /* - * XXX BIG FAT NOTICE - * - * Drop all read locks before taking a write lock: - * - * This is a hack, because bch2_btree_node_lock_write_nofail() is a - * hack - but by dropping read locks first, this should never fail, and - * we only use this in code paths where whatever read locks we've - * already taken are no longer needed: - */ - - trans_for_each_path(trans, linked, iter) { - if (!linked->nodes_locked) - continue; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_read_locked(linked, i)) { - btree_node_unlock(trans, linked, i); - btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); - } - } - - ret = __btree_node_lock_write(trans, path, b, true); + int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 3f33be7e5e5c..556f76f5c84e 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -213,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); for_each_online_member(c, ca) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) + continue; + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); struct task_struct *t; @@ -290,7 +296,7 @@ again: found_btree_node_to_text(&buf, c, n); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - return -1; + return -BCH_ERR_fsck_repair_unimplemented; } } @@ -436,6 +442,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos node_min, struct bpos node_max) { + if (btree_id_is_alloc(btree)) + return 0; + struct find_btree_nodes *f = &c->found_btree_nodes; int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9404d96c38f3..e0c982a4195c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -364,7 +364,21 @@ struct btree_insert_entry { unsigned long ip_allocated; }; +/* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 +/* + * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code + * paths should run inside this limit, and if they don't it usually indicates a + * bug (leaking/duplicated btree paths). + * + * exception: some fsck paths + * + * bugs with excessive path usage seem to have possibly been eliminated now, so + * we might consider eliminating this (and btree_trans_too_many_iter()) at some + * point. + */ +#define BTREE_ITER_NORMAL_LIMIT 256 +/* never exceed limit */ #define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 32397b99752f..c4a5e83a56a4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,9 +26,9 @@ #include <linux/random.h> -const char * const bch2_btree_update_modes[] = { +static const char * const bch2_btree_update_modes[] = { #define x(t) #t, - BCH_WATERMARKS() + BTREE_UPDATE_MODES() #undef x NULL }; @@ -704,9 +704,13 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: - if (as->b) { - - b = as->b; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { btree_path_idx_t path_idx = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; @@ -850,15 +854,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -1100,7 +1106,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1108,7 +1114,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1123,34 +1129,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (watermark < c->journal.watermark) { - struct journal_res res = { 0 }; - unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - - ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_journal_reclaim_would_deadlock; + bch2_trans_unlock(trans); + wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); + ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? */ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1158,11 +1160,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1176,14 +1178,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->watermark = watermark; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1373,12 +1376,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1534,7 +1537,7 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); BUG_ON(bch2_btree_node_check_topology(trans, b)); } @@ -1714,27 +1717,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1755,7 +1737,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1784,7 +1767,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t return ret; } - bch2_btree_insert_keys_interior(as, trans, path, b, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1798,6 +1787,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); BUG_ON(bch2_btree_node_check_topology(trans, b)); @@ -1807,7 +1797,7 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } @@ -2519,9 +2509,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", (void *) as->ip_started, bch2_btree_id_str(as->btree_id), + as->update_level_start, + as->update_level_end, bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 88dcf5a22a3b..c1a479ebaad1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,7 +57,8 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; - unsigned update_level; + unsigned update_level_start; + unsigned update_level_end; struct disk_reservation disk_res; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfa6459bdbc..72781aad6ba7 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; - char **devs; - size_t nr_devs; struct bch_opts opts; }; static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) { struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - if (thr->devs) - for (size_t i = 0; i < thr->nr_devs; i++) - kfree(thr->devs[i]); - kfree(thr->devs); kfree(thr); } static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - - if (IS_ERR(c)) - return PTR_ERR(c); + struct bch_fs *c = thr->c; - int ret = 0; - if (test_bit(BCH_FS_errors_fixed, &c->flags)) - ret |= 1; - if (test_bit(BCH_FS_error, &c->flags)) - ret |= 4; + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; - bch2_fs_stop(c); + ret = bch2_fs_start(thr->c); + if (ret) + goto err; - if (ret & 1) + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - if (ret & 4) + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - + ret |= 4; + } +err: + bch2_fs_stop(c); return ret; } @@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - u64 *devs = NULL; + darray_str(devs) = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || - !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || - !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { - ret = -ENOMEM; - goto err; - } + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; - thr->opts = bch2_opts_empty(); - thr->nr_devs = arg.nr_devs; + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; - if (copy_from_user(devs, &user_arg->devs[0], - array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { - ret = -EINVAL; - goto err; + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } } - for (size_t i = 0; i < arg.nr_devs; i++) { - thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); - ret = PTR_ERR_OR_ZERO(thr->devs[i]); - if (ret) - goto err; + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; } + thr->opts = bch2_opts_empty(); + if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); @@ -230,15 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); -err: - if (ret < 0) { - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - } - kfree(devs); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; } static long bch2_global_ioctl(unsigned cmd, void __user *arg) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 34731ee0217f..0022b51ce3c0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + /* * If current extent durability is less than io_opts.data_replicas, * we're not trying to rereplicate the extent up to data_replicas here - @@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - durability_have >= io_opts.data_replicas) { + !durability_required) { m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ @@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans, goto done; } - m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - BUG_ON(!m->op.nr_replicas); + /* + * If device(s) were set to durability=0 after data was written to them + * we can end up with a duribilty=0 extent, and the normal algorithm + * that tries not to increase durability doesn't work: + */ + if (!(durability_have + durability_removing)) + m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 208ce6f0fc43..cd99b7399414 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" @@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - do { + while (1) { err = flush_buf(i); if (err) return err; @@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, if (!i->size) break; + if (done) + break; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; - } while (!done); + } if (i->buf.allocation_failure) return -ENOMEM; @@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + bch2_btree_updates_to_text(&i->buf, c); + i->iter++; + } + + err = flush_buf(i); + if (err) + return err; + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_updates_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_updates_read, +}; + static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) return -ENOMEM; @@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c) debugfs_remove_recursive(c->fs_debug_dir); } +static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) +{ + struct dentry *d; + + d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); + + debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); + + debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); + + debugfs_create_file("bfloat-failed", 0400, d, bd, + &bfloat_failed_debug_ops); +} + void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; @@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, + c->btree_debug, &btree_updates_ops); + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); @@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_id_str(bd->id), - 0400, c->btree_debug_dir, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &bfloat_failed_debug_ops); + bch2_fs_debug_btree_init(c, bd); } } diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4ce5e957a6e9..0f955c3c76a7 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n) struct wrapper { cmp_func_t cmp; - swap_func_t swap; + swap_func_t swap_func; }; /* @@ -125,7 +125,7 @@ struct wrapper { static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap(a, b, (int)size); + ((const struct wrapper *)priv)->swap_func(a, b, (int)size); return; } @@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, int i, c, r; /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { @@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, { struct wrapper w = { .cmp = cmp_func, - .swap = swap_func, + .swap_func = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index ee0e2df33322..24840aee335c 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) = eytzinger0_next((_i), (_size))) /* return greatest node <= @search, or -1 if not found */ -static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } while (n < nr); if (n & 1) { - /* @i was greater than @search, return previous node: */ + /* + * @i was greater than @search, return previous node: + * + * if @i was leftmost/smallest element, + * eytzinger0_prev(eytzinger0_first())) returns -1, as expected + */ return eytzinger0_prev(i, nr); } else { return i; } } -static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - return eytzinger0_next(idx, size); + + /* + * if eytitzinger0_find_le() returned -1 - no element was <= search - we + * want to return the first element; next/prev identities mean this work + * as expected + * + * similarly if find_le() returns last element, we should return -1; + * identities mean this all works out: + */ + return eytzinger0_next(idx, nr); } #define eytzinger0_find(base, nr, size, _cmp, search) \ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ab811c0dad26..04a577848b01 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); + mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + swap(watermark, j->watermark); if (watermark > j->watermark) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8c053cb64ca5..b5161b5d76a0 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, JOURNAL_NEED_FLUSH_WRITE, + JOURNAL_SPACE_LOW, }; /* Reasons we may fail to get a journal reservation: */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b76c16152579..0f328aba9760 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) } } -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0e806f04f3d7..544322d5c251 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances return s->parent; } +static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return false; + + return test_bit(ancestor - id - 1, s->is_ancestor); +} + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; @@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - if (id && id < ancestor) { - ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); - } else { - ret = id == ancestor; - } + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); out: rcu_read_unlock(); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ed63018f21be..8daf80a38d60 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); + bch2_fs_journal_stop(&c->journal); + bch_info(c, "%sshutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", + c->journal.seq_ondisk); + /* * After stopping journal: */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c86a93a8d8fc..b18b0cc81b59 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -17,7 +17,6 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_update.h" -#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -166,7 +165,6 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); @@ -415,9 +413,6 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) - bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) bch2_btree_cache_to_text(out, c); @@ -639,7 +634,6 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_flags, &sysfs_journal_debug, - &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_new_stripes, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b3fe9fc57747..bfec656f94c0 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b7e7c29278fc..5cf885b09986 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi #endif +static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) +{ + if (v) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + static inline void __set_bit_le64(size_t bit, __le64 *addr) { addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); @@ -795,7 +803,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) static inline void __clear_bit_le64(size_t bit, __le64 *addr) { - addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); } static inline bool test_bit_le64(size_t bit, __le64 *addr) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd6f566a383f..121ab890bd05 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37701531eeb1..c65fe5de4022 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2533,7 +2533,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4611,7 +4620,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 294e31edec9d..55f3ba6a831c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) + goto out; + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; /* Tree log can't currently deal with an inode which is a new root. */ @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap, out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -881,7 +898,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5f90f0605b12..cf8820ce7aa2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4495,6 +4495,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4bb538a372ce..7007f9e0c972 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 6f929cf3bd49..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -18,8 +18,6 @@ struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46e8426adf4f..85f359e0e0a7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -745,14 +745,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -786,8 +778,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; @@ -1495,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1519,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e56..87dcaae32ff8 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index ca73940e26df..e5ee2c694401 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include <linux/kernel.h> #include <linux/types.h> +bool __init cmdline_has_extra_options(void); #else /* !__KERNEL__ */ /* * NOTE: This is only for tools/bootconfig, because tools/bootconfig will diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c00cc6c0878a..8c252e073bd8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -268,7 +268,7 @@ static inline void *offset_to_ptr(const int *off) * - When one operand is a null pointer constant (i.e. when x is an integer * constant expression) and the other is an object pointer (i.e. our * third operand), the conditional operator returns the type of the - * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * object pointer operand (i.e. "int *"). Here, within the sizeof(), we * would then get: * sizeof(*((int *)(...)) == sizeof(int) == 4 * - When one operand is a void pointer (i.e. when x is not an integer diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 868c8fb1bbc1..13becafe41df 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GFP_TYPES_H #define __LINUX_GFP_TYPES_H +#include <linux/bits.h> + /* The typedef is in types.h but we want the documentation here */ #if 0 /** diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d868505a94e..6d92b68efbf6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -80,7 +80,7 @@ DECLARE_PER_CPU(u32, kstack_offset); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - offset ^= (rand); \ + offset = ror32(offset, 5) ^ (rand); \ raw_cpu_write(kstack_offset, offset); \ } \ } while (0) diff --git a/init/main.c b/init/main.c index 2ca52474d0c3..881f6230ee59 100644 --- a/init/main.c +++ b/init/main.c @@ -487,6 +487,11 @@ static int __init warn_bootconfig(char *str) early_param("bootconfig", warn_bootconfig); +bool __init cmdline_has_extra_options(void) +{ + return extra_command_line || extra_init_args; +} + /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9d9095e81792..65adc815fc6e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1567,10 +1567,17 @@ static int check_kprobe_address_safe(struct kprobe *p, jump_label_lock(); preempt_disable(); - /* Ensure it is not in reserved area nor out of text */ - if (!(core_kernel_text((unsigned long) p->addr) || - is_module_text_address((unsigned long) p->addr)) || - in_gate_area_no_mm((unsigned long) p->addr) || + /* Ensure the address is in a text area, and find a module if exists. */ + *probed_mod = NULL; + if (!core_kernel_text((unsigned long) p->addr)) { + *probed_mod = __module_text_address((unsigned long) p->addr); + if (!(*probed_mod)) { + ret = -EINVAL; + goto out; + } + } + /* Ensure it is not in reserved area. */ + if (in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1580,8 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } - /* Check if 'p' is probing a module. */ - *probed_mod = __module_text_address((unsigned long) p->addr); + /* Get module refcount and reject __init functions for loaded modules. */ if (*probed_mod) { /* * We must hold a refcount of the probed module while updating diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 276c12140ee2..c288df9372ed 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -134,7 +134,7 @@ static const test_ubsan_fp test_ubsan_array[] = { }; /* Excluded because they Oops the module. */ -static const test_ubsan_fp skip_ubsan_array[] = { +static __used const test_ubsan_fp skip_ubsan_array[] = { test_ubsan_divrem_overflow, }; diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index c5c2ce113c92..d20c47d21ad8 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -467,6 +467,8 @@ static bool stackleak_gate(void) return false; if (STRING_EQUAL(section, ".entry.text")) return false; + if (STRING_EQUAL(section, ".head.text")) + return false; } return track_frame_size >= 0; diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 4b0673bf52c2..07cfad817d53 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -8,6 +8,7 @@ #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/math.h> +#include <linux/panic.h> #include <endian.h> #include <byteswap.h> diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index f3c82ab5b14c..7d73da098047 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -37,4 +37,9 @@ static inline void totalram_pages_add(long count) { } +static inline int early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + #endif diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h new file mode 100644 index 000000000000..9c8f17a41ce8 --- /dev/null +++ b/tools/include/linux/panic.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_PANIC_H +#define _TOOLS_LINUX_PANIC_H + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +static inline void panic(const char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + exit(-1); +} + +#endif diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8f08c3fd498d..0d3672e5d9ed 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -67,6 +67,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP ++\fB--no-msr\fP Disable all the uses of the MSR driver. ++.PP ++\fB--no-perf\fP Disable all the uses of the perf API. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. @@ -125,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP -\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP -\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. +.PP +\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. +.PP +\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP @@ -370,7 +382,7 @@ below the processor's base frequency. Busy% = MPERF_delta/TSC_delta -Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval +Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval Note that these calculations depend on TSC_delta, so they are not reliable during intervals when TSC_MHz is not running at the base frequency. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a334377f92b..98256468e248 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2024 Intel Corporation. * Len Brown <len.brown@intel.com> */ @@ -36,6 +36,8 @@ #include <linux/perf_event.h> #include <asm/unistd.h> #include <stdbool.h> +#include <assert.h> +#include <linux/kernel.h> #define UNUSED(x) (void)(x) @@ -53,9 +55,13 @@ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define MAX_NOFILE 0x8000 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; +enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -127,6 +133,9 @@ struct msr_counter bic[] = { { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -185,11 +194,14 @@ struct msr_counter bic[] = { #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) #define BIC_UNCORE_MHZ (1ULL << 54) +#define BIC_SAM_mc6 (1ULL << 55) +#define BIC_SAMMHz (1ULL << 56) +#define BIC_SAMACTMHz (1ULL << 57) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -204,10 +216,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +struct amperf_group_fd; + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -242,11 +257,8 @@ char *output_buffer, *outp; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; -unsigned int gfx_cur_mhz; -unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; @@ -263,6 +275,28 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +bool no_msr; +bool no_perf; +enum amperf_source amperf_source; + +enum gfx_sysfs_idx { + GFX_rc6, + GFX_MHz, + GFX_ACTMHz, + SAM_mc6, + SAM_MHz, + SAM_ACTMHz, + GFX_MAX +}; + +struct gfx_sysfs_info { + const char *path; + FILE *fp; + unsigned int val; + unsigned long long val_ull; +}; + +static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -652,6 +686,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -948,6 +983,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 +/* Indexes used to map data read from perf and MSRs into global variables */ +enum rapl_rci_index { + RAPL_RCI_INDEX_ENERGY_PKG = 0, + RAPL_RCI_INDEX_ENERGY_CORES = 1, + RAPL_RCI_INDEX_DRAM = 2, + RAPL_RCI_INDEX_GFX = 3, + RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, + RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, + RAPL_RCI_INDEX_CORE_ENERGY = 6, + NUM_RAPL_COUNTERS, +}; + +enum rapl_unit { + RAPL_UNIT_INVALID, + RAPL_UNIT_JOULES, + RAPL_UNIT_WATTS, +}; + +struct rapl_counter_info_t { + unsigned long long data[NUM_RAPL_COUNTERS]; + enum rapl_source source[NUM_RAPL_COUNTERS]; + unsigned long long flags[NUM_RAPL_COUNTERS]; + double scale[NUM_RAPL_COUNTERS]; + enum rapl_unit unit[NUM_RAPL_COUNTERS]; + + union { + /* Active when source == RAPL_SOURCE_MSR */ + struct { + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; + }; + }; + + int fd_perf; +}; + +/* struct rapl_counter_info_t for each RAPL domain */ +struct rapl_counter_info_t *rapl_counter_info_perdomain; + +#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) + +struct rapl_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + int msr_shift; /* Positive mean shift right, negative mean shift left */ + double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ + unsigned long long flags; +}; + +static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic = BIC_RAMWatt | BIC_RAM_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic = BIC_GFXWatt | BIC_GFX_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_PKG_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, + .bic = BIC_PKG__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_DRAM_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, + .bic = BIC_RAM__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = 0, + }, +}; + +struct rapl_counter { + unsigned long long raw_value; + enum rapl_unit unit; + double scale; +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -974,7 +1178,7 @@ struct core_data { unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; - unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ + struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -989,8 +1193,8 @@ struct pkg_data { unsigned long long pc8; unsigned long long pc9; unsigned long long pc10; - unsigned long long cpu_lpi; - unsigned long long sys_lpi; + long long cpu_lpi; + long long sys_lpi; unsigned long long pkg_wtd_core_c0; unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; @@ -998,13 +1202,16 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int gfx_act_mhz; + long long sam_mc6_ms; + unsigned int sam_mhz; + unsigned int sam_act_mhz; unsigned int package_id; - unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ + struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + struct rapl_counter rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1150,6 +1357,38 @@ struct sys_counters { struct msr_counter *pp; } sys; +void free_sys_counters(void) +{ + struct msr_counter *p = sys.tp, *pnext = NULL; + + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.cp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.pp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + sys.added_thread_counters = 0; + sys.added_core_counters = 0; + sys.added_package_counters = 0; + sys.tp = NULL; + sys.cp = NULL; + sys.pp = NULL; +} + struct system_summary { struct thread_data threads; struct core_data cores; @@ -1280,34 +1519,60 @@ int get_msr_fd(int cpu) sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " + "or run with --no-msr, or run as root", pathname); fd_percpu[cpu] = fd; return fd; } +static void bic_disable_msr_access(void) +{ + const unsigned long bic_msrs = + BIC_SMI | + BIC_CPU_c1 | + BIC_CPU_c3 | + BIC_CPU_c6 | + BIC_CPU_c7 | + BIC_Mod_c6 | + BIC_CoreTmp | + BIC_Totl_c0 | + BIC_Any_c0 | + BIC_GFX_c0 | + BIC_CPUGFX | + BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + + bic_enabled &= ~bic_msrs; + + free_sys_counters(); +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + assert(!no_perf); + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int perf_instr_count_open(int cpu_num) +static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { - struct perf_event_attr pea; - int fd; + struct perf_event_attr attr; + const pid_t pid = -1; + const unsigned long flags = 0; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_INSTRUCTIONS; + assert(!no_perf); - /* counter for cpu_num, including user + kernel and all processes */ - fd = perf_event_open(&pea, -1, cpu_num, -1, 0); - if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); - BIC_NOT_PRESENT(BIC_IPC); - } + memset(&attr, 0, sizeof(struct perf_event_attr)); + + attr.type = type; + attr.size = sizeof(struct perf_event_attr); + attr.config = config; + attr.disabled = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.read_format = read_format; + + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); return fd; } @@ -1317,7 +1582,7 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -1326,6 +1591,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; + assert(!no_msr); + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) @@ -1334,6 +1601,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +int probe_msr(int cpu, off_t offset) +{ + ssize_t retval; + unsigned long long dummy; + + assert(!no_msr); + + retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + + if (retval != sizeof(dummy)) + return 1; + + return 0; +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1369,6 +1651,8 @@ void help(void) " Override default 5-second measurement interval\n" " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" + " -M, --no-msr Disable all uses of the MSR driver\n" + " -P, --no-perf Disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -1573,6 +1857,15 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_SAM_mc6)) + outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -1671,26 +1964,35 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); + outp += + sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + t->counter[i], mp->path); } } - if (c) { + if (c && is_cpu_first_thread_in_core(t, c, p)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); outp += sprintf(outp, "c7: %016llX\n", c->c7); outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c); outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt); - outp += sprintf(outp, "Joules: %0X\n", c->core_energy); + + const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; + const double energy_scale = c->core_energy.scale; + + if (c->core_energy.unit == RAPL_UNIT_JOULES) + outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); + outp += + sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + c->counter[i], mp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p) { + if (p && is_cpu_first_core_in_package(t, c, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -1710,16 +2012,18 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); + outp += + sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + p->counter[i], mp->path); } } @@ -1728,6 +2032,23 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } +double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval) +{ + assert(desired_unit != RAPL_UNIT_INVALID); + + /* + * For now we don't expect anything other than joules, + * so just simplify the logic. + */ + assert(c->unit == RAPL_UNIT_JOULES); + + const double scaled = c->raw_value * c->scale; + + if (desired_unit == RAPL_UNIT_WATTS) + return scaled / interval; + return scaled; +} + /* * column formatting convention & formats */ @@ -1921,9 +2242,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) @@ -1951,6 +2274,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* SAMmc6 */ + if (DO_BIC(BIC_SAM_mc6)) { + if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ + outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); + } else { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + p->sam_mc6_ms / 10.0 / interval_float); + } + } + + /* SAMMHz */ + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz); + + /* SAMACTMHz */ + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); @@ -1976,43 +2317,59 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); - if (DO_BIC(BIC_CPU_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); - if (DO_BIC(BIC_SYS_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { + if (p->cpu_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } + if (DO_BIC(BIC_SYS_LPI)) { + if (p->sys_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->sys_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } if (DO_BIC(BIC_PkgWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - p->energy_dram * rapl_dram_energy_units / interval_float); + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); @@ -2121,12 +2478,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; - old->energy_pkg = new->energy_pkg - old->energy_pkg; - old->energy_cores = new->energy_cores - old->energy_cores; - old->energy_gfx = new->energy_gfx - old->energy_gfx; - old->energy_dram = new->energy_dram - old->energy_dram; - old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; - old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; + /* flag an error when mc6 counter resets/wraps */ + if (old->sam_mc6_ms > new->sam_mc6_ms) + old->sam_mc6_ms = -1; + else + old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms; + + old->sam_mhz = new->sam_mhz; + old->sam_act_mhz = new->sam_act_mhz; + + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; + old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; + old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; + old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; + old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = + new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2150,7 +2517,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->core_throt_cnt = new->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; - DELTA_WRAP32(new->core_energy, old->core_energy); + DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2277,6 +2644,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void rapl_counter_clear(struct rapl_counter *c) +{ + c->raw_value = 0; + c->scale = 0.0; + c->unit = RAPL_UNIT_INVALID; +} + void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2304,7 +2678,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; - c->core_energy = 0; + rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; p->pkg_wtd_core_c0 = 0; @@ -2325,18 +2699,21 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->cpu_lpi = 0; p->sys_lpi = 0; - p->energy_pkg = 0; - p->energy_dram = 0; - p->energy_cores = 0; - p->energy_gfx = 0; - p->rapl_pkg_perf_status = 0; - p->rapl_dram_perf_status = 0; + rapl_counter_clear(&p->energy_pkg); + rapl_counter_clear(&p->energy_dram); + rapl_counter_clear(&p->energy_cores); + rapl_counter_clear(&p->energy_gfx); + rapl_counter_clear(&p->rapl_pkg_perf_status); + rapl_counter_clear(&p->rapl_dram_perf_status); p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; + p->sam_mc6_ms = 0; + p->sam_mhz = 0; + p->sam_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -2347,6 +2724,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->counter[i] = 0; } +void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) +{ + /* Copy unit and scale from src if dst is not initialized */ + if (dst->unit == RAPL_UNIT_INVALID) { + dst->unit = src->unit; + dst->scale = src->scale; + } + + assert(dst->unit == src->unit); + assert(dst->scale == src->scale); + + dst->raw_value += src->raw_value; +} + int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2393,7 +2784,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); - average.cores.core_energy += c->core_energy; + rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2428,25 +2819,29 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; - average.packages.energy_pkg += p->energy_pkg; - average.packages.energy_dram += p->energy_dram; - average.packages.energy_cores += p->energy_cores; - average.packages.energy_gfx += p->energy_gfx; + rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; + average.packages.sam_mc6_ms = p->sam_mc6_ms; + average.packages.sam_mhz = p->sam_mhz; + average.packages.sam_act_mhz = p->sam_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); - average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; - average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; + rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) - continue; - average.packages.counter[i] += p->counter[i]; + if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.counter[i] = p->counter[i]; + else + average.packages.counter[i] += p->counter[i]; } return 0; } @@ -2578,6 +2973,7 @@ unsigned long long snapshot_sysfs_counter(char *path) int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) { if (mp->msr_num != 0) { + assert(!no_msr); if (get_msr(cpu, mp->msr_num, counterp)) return -1; } else { @@ -2599,7 +2995,7 @@ unsigned long long get_uncore_mhz(int package, int die) { char path[128]; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); return (snapshot_sysfs_counter(path) / 1000); @@ -2627,6 +3023,9 @@ int get_epb(int cpu) return epb; msr_fallback: + if (no_msr) + return -1; + get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); return msr & 0xf; @@ -2700,6 +3099,351 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } +struct amperf_group_fd { + int aperf; /* Also the group descriptor */ + int mperf; +}; + +static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) +{ + int fdmt; + int bytes_read; + char buf[64]; + int ret = -1; + + fdmt = open(path, O_RDONLY, 0); + if (fdmt == -1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + bytes_read = read(fdmt, buf, sizeof(buf) - 1); + if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + buf[bytes_read] = '\0'; + + if (sscanf(buf, parse_format, value_ptr) != 1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + + ret = 0; + +cleanup_and_exit: + close(fdmt); + return ret; +} + +static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format) +{ + unsigned int v; + int status; + + status = read_perf_counter_info(path, parse_format, &v); + if (status) + v = -1; + + return v; +} + +static unsigned int read_msr_type(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/type"; + const char *const format = "%u"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_aperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_mperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_perf_type(const char *subsys) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/type"; + const char *const format = "%u"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys); + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_rapl_config(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; + const char *const format = "event=%x"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + return read_perf_counter_info_n(path, format); +} + +static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; + const char *const format = "%s"; + char path[128]; + char unit_buffer[16]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + read_perf_counter_info(path, format, &unit_buffer); + if (strcmp("Joules", unit_buffer) == 0) + return RAPL_UNIT_JOULES; + + return RAPL_UNIT_INVALID; +} + +static double read_perf_rapl_scale(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; + const char *const format = "%lf"; + char path[128]; + double scale; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + if (read_perf_counter_info(path, format, &scale)) + return 0.0; + + return scale; +} + +static struct amperf_group_fd open_amperf_fd(int cpu) +{ + const unsigned int msr_type = read_msr_type(); + const unsigned int aperf_config = read_aperf_config(); + const unsigned int mperf_config = read_mperf_config(); + struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; + + fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + + return fds; +} + +static int get_amperf_fd(int cpu) +{ + assert(fd_amperf_percpu); + + if (fd_amperf_percpu[cpu].aperf) + return fd_amperf_percpu[cpu].aperf; + + fd_amperf_percpu[cpu] = open_amperf_fd(cpu); + + return fd_amperf_percpu[cpu].aperf; +} + +/* Read APERF, MPERF and TSC using the perf API. */ +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) +{ + union { + struct { + unsigned long nr_entries; + unsigned long aperf; + unsigned long mperf; + }; + + unsigned long as_array[3]; + } cnt; + + const int fd_amperf = get_amperf_fd(cpu); + + /* + * Read the TSC with rdtsc, because we want the absolute value and not + * the offset from the start of the counter. + */ + t->tsc = rdtsc(); + + const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + + if (n != sizeof(cnt.as_array)) + return -2; + + t->aperf = cnt.aperf * aperf_mperf_multiplier; + t->mperf = cnt.mperf * aperf_mperf_multiplier; + + return 0; +} + +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) +{ + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + int aperf_mperf_retry_count = 0; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system call, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. + */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + +retry: + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + tsc_between = rdtsc(); + + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) + return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differ by more than 2x, then try again. + */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + + t->aperf = t->aperf * aperf_mperf_multiplier; + t->mperf = t->mperf * aperf_mperf_multiplier; + + return 0; +} + +size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) + if (rci->source[i] == RAPL_SOURCE_PERF) + ++ret; + + return ret; +} + +void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) +{ + rc->raw_value = rci->data[idx]; + rc->unit = rci->unit[idx]; + rc->scale = rci->scale[idx]; +} + +int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +{ + unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + + if (debug) + fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); + + assert(rapl_counter_info_perdomain); + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (rci->fd_perf != -1) { + size_t num_perf_counters = rapl_counter_info_count_perf(rci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { + switch (rci->source[i]) { + case RAPL_SOURCE_NONE: + break; + + case RAPL_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(rci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", + i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); + + rci->data[i] = perf_data[pi]; + + ++pi; + break; + + case RAPL_SOURCE_MSR: + if (debug) + fprintf(stderr, "Reading rapl counter via msr at %u\n", i); + + assert(!no_msr); + if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) { + if (get_msr_sum(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } else { + if (get_msr(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } + + rci->data[i] &= rci->msr_mask[i]; + if (rci->msr_shift[i] >= 0) + rci->data[i] >>= abs(rci->msr_shift[i]); + else + rci->data[i] <<= abs(rci->msr_shift[i]); + + break; + } + } + + _Static_assert(NUM_RAPL_COUNTERS == 7); + write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); + write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); + write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); + write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX); + write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); + write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); + write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2709,12 +3453,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; - int aperf_mperf_retry_count = 0; struct msr_counter *mp; int i; + int status; if (cpu_migrate(cpu)) { - fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); + fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu); return -1; } @@ -2722,63 +3466,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (first_counter_read) get_apic_id(t); -retry: + t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) || soft_c1_residency_display(BIC_Avg_MHz)) { - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; + int status = -1; - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; + assert(!no_perf || !no_msr); - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. - */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + switch (amperf_source) { + case AMPERF_SOURCE_PERF: + status = read_aperf_mperf_tsc_perf(t, cpu); + break; + case AMPERF_SOURCE_MSR: + status = read_aperf_mperf_tsc_msr(t, cpu); + break; } - aperf_mperf_retry_count = 0; - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; + if (status != 0) + return status; } if (DO_BIC(BIC_IPC)) @@ -2806,6 +3513,12 @@ retry: if (!is_cpu_first_thread_in_core(t, c, p)) goto done; + if (platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, c->core_id, c, p); + if (status != 0) + return status; + } + if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; @@ -2846,12 +3559,6 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) - return -14; - c->core_energy = msr & 0xFFFFFFFF; - } - for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -2911,59 +3618,39 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; + if (!platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, p->package_id, c, p); + if (status != 0) + return status; } + if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_cur_rc6_ms; - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFX_rc6)) + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; + if (DO_BIC(BIC_GFXMHz)) - p->gfx_mhz = gfx_cur_mhz; + p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) - p->gfx_act_mhz = gfx_act_mhz; + p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; + + if (DO_BIC(BIC_SAM_mc6)) + p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull; + + if (DO_BIC(BIC_SAMMHz)) + p->sam_mhz = gfx_info[SAM_MHz].val; + + if (DO_BIC(BIC_SAMACTMHz)) + p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) @@ -3053,7 +3740,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; switch (platform->cst_limit) { @@ -3097,7 +3784,7 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); @@ -3115,7 +3802,7 @@ static void dump_power_ctl(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); @@ -3321,7 +4008,7 @@ static void dump_cst_cfg(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); @@ -3393,7 +4080,7 @@ void print_irtl(void) { unsigned long long msr; - if (!platform->has_irtl_msrs) + if (!platform->has_irtl_msrs || no_msr) return; if (platform->supported_cstates & PC3) { @@ -3443,12 +4130,64 @@ void free_fd_percpu(void) { int i; + if (!fd_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_percpu[i] != 0) close(fd_percpu[i]); } free(fd_percpu); + fd_percpu = NULL; +} + +void free_fd_amperf_percpu(void) +{ + int i; + + if (!fd_amperf_percpu) + return; + + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_amperf_percpu[i].mperf != 0) + close(fd_amperf_percpu[i].mperf); + + if (fd_amperf_percpu[i].aperf != 0) + close(fd_amperf_percpu[i].aperf); + } + + free(fd_amperf_percpu); + fd_amperf_percpu = NULL; +} + +void free_fd_instr_count_percpu(void) +{ + if (!fd_instr_count_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_instr_count_percpu[i] != 0) + close(fd_instr_count_percpu[i]); + } + + free(fd_instr_count_percpu); + fd_instr_count_percpu = NULL; +} + +void free_fd_rapl_percpu(void) +{ + if (!rapl_counter_info_perdomain) + return; + + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) + close(rapl_counter_info_perdomain[domain_id].fd_perf); + } + + free(rapl_counter_info_perdomain); } void free_all_buffers(void) @@ -3492,6 +4231,9 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_instr_count_percpu(); + free_fd_amperf_percpu(); + free_fd_rapl_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -3825,11 +4567,17 @@ static void update_effective_set(bool startup) err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); } +void linux_perf_init(void); +void rapl_perf_init(void); + void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); + linux_perf_init(); + rapl_perf_init(); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, + topo.allowed_cpus); } void set_max_cpu_num(void) @@ -3940,85 +4688,43 @@ int snapshot_proc_interrupts(void) } /* - * snapshot_gfx_rc6_ms() + * snapshot_graphics() * - * record snapshot of - * /sys/class/drm/card0/power/rc6_residency_ms + * record snapshot of specified graphics sysfs knob * * return 1 if config change requires a restart, else return 0 */ -int snapshot_gfx_rc6_ms(void) +int snapshot_graphics(int idx) { FILE *fp; int retval; - fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); - - retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); - if (retval != 1) - err(1, "GFX rc6"); - - fclose(fp); - - return 0; -} - -/* - * snapshot_gfx_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz - * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - } else { - rewind(fp); - fflush(fp); - } - - retval = fscanf(fp, "%d", &gfx_cur_mhz); - if (retval != 1) - err(1, "GFX MHz"); - - return 0; -} - -/* - * snapshot_gfx_cur_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz - * when /sys/class/drm/card0/gt_act_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_act_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); - } else { - rewind(fp); - fflush(fp); + switch (idx) { + case GFX_rc6: + case SAM_mc6: + fp = fopen_or_die(gfx_info[idx].path, "r"); + retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + if (retval != 1) + err(1, "rc6"); + fclose(fp); + return 0; + case GFX_MHz: + case GFX_ACTMHz: + case SAM_MHz: + case SAM_ACTMHz: + if (gfx_info[idx].fp == NULL) { + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + } else { + rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); + } + retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); + if (retval != 1) + err(1, "MHz"); + return 0; + default: + return -EINVAL; } - - retval = fscanf(fp, "%d", &gfx_act_mhz); - if (retval != 1) - err(1, "GFX ACT MHz"); - - return 0; } /* @@ -4083,13 +4789,22 @@ int snapshot_proc_sysfs_files(void) return 1; if (DO_BIC(BIC_GFX_rc6)) - snapshot_gfx_rc6_ms(); + snapshot_graphics(GFX_rc6); if (DO_BIC(BIC_GFXMHz)) - snapshot_gfx_mhz(); + snapshot_graphics(GFX_MHz); if (DO_BIC(BIC_GFXACTMHz)) - snapshot_gfx_act_mhz(); + snapshot_graphics(GFX_ACTMHz); + + if (DO_BIC(BIC_SAM_mc6)) + snapshot_graphics(SAM_mc6); + + if (DO_BIC(BIC_SAMMHz)) + snapshot_graphics(SAM_MHz); + + if (DO_BIC(BIC_SAMACTMHz)) + snapshot_graphics(SAM_ACTMHz); if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -4173,6 +4888,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) int ret, idx; unsigned long long msr_cur, msr_last; + assert(!no_msr); + if (!per_cpu_msr_sum) return 1; @@ -4201,6 +4918,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg UNUSED(c); UNUSED(p); + assert(!no_msr); + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; off_t offset; @@ -4280,7 +4999,8 @@ release_msr: /* * set_my_sched_priority(pri) - * return previous + * return previous priority on success + * return value < -20 on failure */ int set_my_sched_priority(int priority) { @@ -4290,16 +5010,16 @@ int set_my_sched_priority(int priority) errno = 0; original_priority = getpriority(PRIO_PROCESS, 0); if (errno && (original_priority == -1)) - err(errno, "getpriority"); + return -21; retval = setpriority(PRIO_PROCESS, 0, priority); if (retval) - errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname); + return -21; errno = 0; retval = getpriority(PRIO_PROCESS, 0); if (retval != priority) - err(retval, "getpriority(%d) != setpriority(%d)", retval, priority); + return -21; return original_priority; } @@ -4314,6 +5034,9 @@ void turbostat_loop() /* * elevate own priority for interval mode + * + * ignore on error - we probably don't have permission to set it, but + * it's not a big deal */ set_my_sched_priority(-20); @@ -4399,10 +5122,13 @@ void check_dev_msr() struct stat sb; char pathname[32]; + if (no_msr) + return; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + no_msr = 1; } /* @@ -4414,47 +5140,51 @@ int check_for_cap_sys_rawio(void) { cap_t caps; cap_flag_value_t cap_flag_value; + int ret = 0; caps = cap_get_proc(); if (caps == NULL) - err(-6, "cap_get_proc\n"); + return 1; - if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) - err(-6, "cap_get\n"); + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { + ret = 1; + goto free_and_exit; + } if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); - return 1; + ret = 1; + goto free_and_exit; } +free_and_exit: if (cap_free(caps) == -1) err(-6, "cap_free\n"); - return 0; + return ret; } -void check_permissions(void) +void check_msr_permission(void) { - int do_exit = 0; + int failed = 0; char pathname[32]; + if (no_msr) + return; + /* check for CAP_SYS_RAWIO */ - do_exit += check_for_cap_sys_rawio(); + failed += check_for_cap_sys_rawio(); /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { - do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr"); + failed++; } - /* if all else fails, thell them to be root */ - if (do_exit) - if (getuid() != 0) - warnx("... or simply run as root"); - - if (do_exit) - exit(-6); + if (failed) { + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr"); + no_msr = 1; + } } void probe_bclk(void) @@ -4462,7 +5192,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4502,7 +5232,7 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->trl_msrs & TRL_LIMIT2) @@ -4567,20 +5297,15 @@ static void dump_sysfs_file(char *path) static void probe_intel_uncore_frequency(void) { int i, j; - char path[128]; + char path[256]; if (!genuine_intel) return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) - return; - - /* Cluster level sysfs not supported yet. */ - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) - return; + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + goto probe_cluster; - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - BIC_PRESENT(BIC_UNCORE_MHZ); + BIC_PRESENT(BIC_UNCORE_MHZ); if (quiet) return; @@ -4588,40 +5313,178 @@ static void probe_intel_uncore_frequency(void) for (i = 0; i < topo.num_packages; ++i) { for (j = 0; j < topo.num_die; ++j) { int k, l; + char path_base[128]; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, + j); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz", - i, j); + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz", - i, j); + sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz", - i, j); + sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz", - i, j); + sprintf(path, "%s/initial_max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); } } + return; + +probe_cluster: + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) + return; + + if (quiet) + return; + + for (i = 0;; ++i) { + int k, l; + char path_base[128]; + int package_id, domain_id, cluster_id; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + + if (access(path_base, R_OK)) + break; + + sprintf(path, "%s/package_id", path_base); + package_id = read_sysfs_int(path); + + sprintf(path, "%s/domain_id", path_base); + domain_id = read_sysfs_int(path); + + sprintf(path, "%s/fabric_cluster_id", path_base); + cluster_id = read_sysfs_int(path); + + sprintf(path, "%s/min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, + cluster_id, k / 1000, l / 1000); + + sprintf(path, "%s/initial_min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/initial_max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); + } } static void probe_graphics(void) { + /* Xe graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { + FILE *fp; + char buf[8]; + bool gt0_is_gt; + int idx; + + fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); + if (!fp) + goto next; + + if (!fread(buf, sizeof(char), 7, fp)) { + fclose(fp); + goto next; + } + fclose(fp); + + if (!strncmp(buf, "gt0-rc", strlen("gt0-rc"))) + gt0_is_gt = true; + else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc"))) + gt0_is_gt = false; + else + goto next; + + idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? GFX_MHz : SAM_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + + idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + + idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; + if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? SAM_MHz : GFX_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + + idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + + goto end; + } + +next: + /* New i915 graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) + gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) + gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) + gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + + goto end; + } + + /* Fall back to traditional i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; + +end: + if (gfx_info[GFX_rc6].path) + BIC_PRESENT(BIC_GFX_rc6); + if (gfx_info[GFX_MHz].path) + BIC_PRESENT(BIC_GFXMHz); + if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); + if (gfx_info[SAM_mc6].path) + BIC_PRESENT(BIC_SAM_mc6); + if (gfx_info[SAM_MHz].path) + BIC_PRESENT(BIC_SAMMHz); + if (gfx_info[SAM_ACTMHz].path) + BIC_PRESENT(BIC_SAMACTMHz); } static void dump_sysfs_cstate_config(void) @@ -4783,6 +5646,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!has_hwp) return 0; @@ -4869,6 +5735,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + cpu = t->cpu_id; /* per-package */ @@ -4983,31 +5852,18 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_Pkg_J); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_Cor_J); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAM_J); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFX_J); - } else { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_PkgWatt); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_CorWatt); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAMWatt); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFXWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) - BIC_PRESENT(BIC_PKG__); - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) - BIC_PRESENT(BIC_RAM__); + if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + bic_enabled &= ~BIC_PKG__; + if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + bic_enabled &= ~BIC_RAM__; /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -5041,14 +5897,13 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; @@ -5202,7 +6057,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) */ void probe_rapl(void) { - if (!platform->rapl_msrs) + if (!platform->rapl_msrs || no_msr) return; if (genuine_intel) @@ -5258,7 +6113,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5305,6 +6160,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!(do_dts || do_ptm)) return 0; @@ -5402,6 +6260,9 @@ void decode_feature_control_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); @@ -5411,6 +6272,9 @@ void decode_misc_enable_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!genuine_intel) return; @@ -5428,6 +6292,9 @@ void decode_misc_feature_control(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_feature_control) return; @@ -5449,6 +6316,9 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_pwr_mgmt) return; @@ -5468,6 +6338,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_c6_demotion_policy_config) return; @@ -5489,7 +6362,8 @@ void print_dev_latency(void) fd = open(path, O_RDONLY); if (fd < 0) { - warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); + if (debug) + warnx("Read %s failed", path); return; } @@ -5504,23 +6378,260 @@ void print_dev_latency(void) close(fd); } +static int has_instr_count_access(void) +{ + int fd; + int has_access; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + has_access = fd != -1; + + if (fd != -1) + close(fd); + + if (!has_access) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "instructions retired perf counter", "--no-perf"); + + return has_access; +} + +bool is_aperf_access_required(void) +{ + return BIC_IS_ENABLED(BIC_Avg_MHz) + || BIC_IS_ENABLED(BIC_Busy) + || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC); +} + +int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale_, enum rapl_unit *unit_) +{ + if (no_perf) + return -1; + + const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + + if (scale == 0.0) + return -1; + + const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + + if (unit == RAPL_UNIT_INVALID) + return -1; + + const unsigned int rapl_type = read_perf_type(cai->perf_subsys); + const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (rci->fd_perf == -1) + rci->fd_perf = fd_counter; + + *scale_ = scale; + *unit_ = unit; + return fd_counter; +} + +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale, enum rapl_unit *unit) +{ + int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover */ void linux_perf_init(void) { - if (!BIC_IS_ENABLED(BIC_IPC)) - return; - if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); - if (fd_instr_count_percpu == NULL) - err(-1, "calloc fd_instr_count_percpu"); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + } + + const bool aperf_required = is_aperf_access_required(); + + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { + fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); + if (fd_amperf_percpu == NULL) + err(-1, "calloc fd_amperf_percpu"); + } +} + +void rapl_perf_init(void) +{ + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + bool *domain_visited = calloc(num_domains, sizeof(bool)); + + rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); + if (rapl_counter_info_perdomain == NULL) + err(-1, "calloc rapl_counter_info_percpu"); + + /* + * Initialize rapl_counter_info_percpu + */ + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + + rci->fd_perf = -1; + for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { + rci->data[i] = 0; + rci->source[i] = RAPL_SOURCE_NONE; + } + } - BIC_PRESENT(BIC_IPC); + /* + * Open/probe the counters + * If can't get it via perf, fallback to MSR + */ + for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) { + + const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i]; + bool has_counter = 0; + double scale; + enum rapl_unit unit; + int next_domain; + + memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + /* Skip already seen and handled RAPL domains */ + next_domain = + platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + + if (domain_visited[next_domain]) + continue; + + domain_visited[next_domain] = 1; + + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; + + /* Check if the counter is enabled and accessible */ + if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) { + + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { + rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->scale[cai->rci_index] = scale * cai->compat_scale; + rci->unit[cai->rci_index] = unit; + rci->flags[cai->rci_index] = cai->flags; + + /* Use MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->msr[cai->rci_index] = cai->msr; + rci->msr_mask[cai->rci_index] = cai->msr_mask; + rci->msr_shift[cai->rci_index] = cai->msr_shift; + rci->unit[cai->rci_index] = RAPL_UNIT_JOULES; + rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale; + rci->flags[cai->rci_index] = cai->flags; + } + } + + if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + has_counter = 1; + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(domain_visited); +} + +static int has_amperf_access_via_msr(void) +{ + if (no_msr) + return 0; + + if (probe_msr(base_cpu, MSR_IA32_APERF)) + return 0; + + if (probe_msr(base_cpu, MSR_IA32_MPERF)) + return 0; + + return 1; +} + +static int has_amperf_access_via_perf(void) +{ + struct amperf_group_fd fds; + + /* + * Cache the last result, so we don't warn the user multiple times + * + * Negative means cached, no access + * Zero means not cached + * Positive means cached, has access + */ + static int has_access_cached; + + if (no_perf) + return 0; + + if (has_access_cached != 0) + return has_access_cached > 0; + + fds = open_amperf_fd(base_cpu); + has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); + + if (fds.aperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "APERF perf counter", "--no-perf"); + else + close(fds.aperf); + + if (fds.mperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "MPERF perf counter", "--no-perf"); + else + close(fds.mperf); + + if (has_access_cached == 0) + has_access_cached = -1; + + return has_access_cached > 0; +} + +/* Check if we can access APERF and MPERF */ +static int has_amperf_access(void) +{ + if (!is_aperf_access_required()) + return 0; + + if (!no_msr && has_amperf_access_via_msr()) + return 1; + + if (!no_perf && has_amperf_access_via_perf()) + return 1; + + return 0; } void probe_cstates(void) @@ -5563,7 +6674,7 @@ void probe_cstates(void) if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (platform->has_ext_cst_msrs) { + if (platform->has_ext_cst_msrs && !no_msr) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); @@ -5623,6 +6734,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned long long ucode_patch = 0; + bool ucode_patch_valid = false; eax = ebx = ecx = edx = 0; @@ -5650,8 +6762,12 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) - warnx("get_msr(UCODE)"); + if (!no_msr) { + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; + } /* * check max extended function levels of CPUID. @@ -5662,9 +6778,12 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", + family, model, stepping, family, model, stepping); + if (ucode_patch_valid) + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fputc('\n', outf); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", @@ -5700,10 +6819,11 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf) { + if (has_aperf && has_amperf_access()) { BIC_PRESENT(BIC_Avg_MHz); BIC_PRESENT(BIC_Busy); BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_IPC); } do_dts = eax & (1 << 0); if (do_dts) @@ -5786,6 +6906,15 @@ void process_cpuid() base_mhz = max_mhz = bus_mhz = edx = 0; __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); + + bclk = bus_mhz; + + base_hz = base_mhz * 1000000; + has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; + if (!quiet) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); @@ -5814,7 +6943,7 @@ void probe_pm_features(void) probe_thermal(); - if (platform->has_nhm_msrs) + if (platform->has_nhm_msrs && !no_msr) BIC_PRESENT(BIC_SMI); if (!quiet) @@ -6142,6 +7271,7 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } + void setup_all_buffers(bool startup) { topology_probe(startup); @@ -6169,21 +7299,129 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void set_amperf_source(void) +{ + amperf_source = AMPERF_SOURCE_PERF; + + const bool aperf_required = is_aperf_access_required(); + + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) + amperf_source = AMPERF_SOURCE_MSR; + + if (quiet || !debug) + return; + + fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); +} + +bool has_added_counters(void) +{ + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ + + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} + +bool is_msr_access_required(void) +{ + if (no_msr) + return false; + + if (has_added_counters()) + return true; + + return BIC_IS_ENABLED(BIC_SMI) + || BIC_IS_ENABLED(BIC_CPU_c1) + || BIC_IS_ENABLED(BIC_CPU_c3) + || BIC_IS_ENABLED(BIC_CPU_c6) + || BIC_IS_ENABLED(BIC_CPU_c7) + || BIC_IS_ENABLED(BIC_Mod_c6) + || BIC_IS_ENABLED(BIC_CoreTmp) + || BIC_IS_ENABLED(BIC_Totl_c0) + || BIC_IS_ENABLED(BIC_Any_c0) + || BIC_IS_ENABLED(BIC_GFX_c0) + || BIC_IS_ENABLED(BIC_CPUGFX) + || BIC_IS_ENABLED(BIC_Pkgpc3) + || BIC_IS_ENABLED(BIC_Pkgpc6) + || BIC_IS_ENABLED(BIC_Pkgpc2) + || BIC_IS_ENABLED(BIC_Pkgpc7) + || BIC_IS_ENABLED(BIC_Pkgpc8) + || BIC_IS_ENABLED(BIC_Pkgpc9) + || BIC_IS_ENABLED(BIC_Pkgpc10) + /* TODO: Multiplex access with perf */ + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_PkgWatt) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_GFXWatt) + || BIC_IS_ENABLED(BIC_RAMWatt) + || BIC_IS_ENABLED(BIC_Pkg_J) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_GFX_J) + || BIC_IS_ENABLED(BIC_RAM_J) + || BIC_IS_ENABLED(BIC_PKG__) + || BIC_IS_ENABLED(BIC_RAM__) + || BIC_IS_ENABLED(BIC_PkgTmp) + || (is_aperf_access_required() && !has_amperf_access_via_perf()); +} + +void check_msr_access(void) +{ + if (!is_msr_access_required()) + no_msr = 1; + + check_dev_msr(); + check_msr_permission(); + + if (no_msr) + bic_disable_msr_access(); +} + +void check_perf_access(void) +{ + const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + + if (no_perf || !intrcount_required || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; + + const bool aperf_required = is_aperf_access_required(); + + if (!aperf_required || !has_amperf_access()) { + bic_enabled &= ~BIC_Avg_MHz; + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + bic_enabled &= ~BIC_IPC; + } +} + void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - check_dev_msr(); - check_permissions(); + check_msr_access(); + check_perf_access(); process_cpuid(); probe_pm_features(); + set_amperf_source(); linux_perf_init(); + rapl_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); + + /* + * If TSC tweak is needed, but couldn't get it, + * disable more BICs, since it can't be reported accurately. + */ + if (platform->enable_tsc_tweak && !has_base_hz) { + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + } } int fork_it(char **argv) @@ -6259,7 +7497,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.11.07 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -6291,6 +7529,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, { struct msr_counter *msrp; + if (no_msr && msr_num) + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); + msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) { perror("calloc"); @@ -6595,6 +7836,8 @@ void cmdline(int argc, char **argv) { "list", no_argument, 0, 'l' }, { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, + { "no-msr", no_argument, 0, 'M' }, + { "no-perf", no_argument, 0, 'P' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6604,7 +7847,25 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { + /* + * Parse some options early, because they may make other options invalid, + * like adding the MSR counter with --add and at the same time using --no-msr. + */ + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { + switch (opt) { + case 'M': + no_msr = 1; + break; + case 'P': + no_perf = 1; + break; + default: + break; + } + } + optind = 0; + + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6662,6 +7923,10 @@ void cmdline(int argc, char **argv) case 'q': quiet = 1; break; + case 'M': + case 'P': + /* Parsed earlier */ + break; case 'n': num_iterations = strtod(optarg, NULL); @@ -6704,6 +7969,22 @@ void cmdline(int argc, char **argv) } } +void set_rlimit(void) +{ + struct rlimit limit; + + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to get rlimit"); + + if (limit.rlim_max < MAX_NOFILE) + limit.rlim_max = MAX_NOFILE; + if (limit.rlim_cur < MAX_NOFILE) + limit.rlim_cur = MAX_NOFILE; + + if (setrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to set rlimit"); +} + int main(int argc, char **argv) { int fd, ret; @@ -6729,9 +8010,13 @@ skip_cgroup_setting: probe_sysfs(); + if (!getuid()) + set_rlimit(); + turbostat_init(); - msr_sum_record(); + if (!no_msr) + msr_sum_record(); /* dump counters and exit */ if (dump_only) diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py new file mode 100755 index 000000000000..d9b042097da7 --- /dev/null +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -0,0 +1,60 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +# +# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns +# which are only visible when running with --debug. +# +expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip() +expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'') + +# +# Run turbostat with no options for 10 seconds and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s'] +turbostat_argv = [turbostat, '-i', '0.250'] + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) +print('OK') + +# +# Same, but with --debug +# +turbostat_argv.append('--debug') + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) +print('OK') |