summaryrefslogtreecommitdiff
AgeCommit message (Collapse)Author
2024-09-11sock_map: Add a cond_resched() in sock_hash_free()Eric Dumazet
Several syzbot soft lockup reports all have in common sock_hash_free() If a map with a large number of buckets is destroyed, we need to yield the cpu when needed. Fixes: 75e68e5bf2c7 ("bpf, sockhash: Synchronize delete from bucket list on map free") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Martin KaFai Lau <martin.lau@kernel.org> Acked-by: John Fastabend <john.fastabend@gmail.com> Link: https://lore.kernel.org/bpf/20240906154449.3742932-1-edumazet@google.com
2024-09-11perf trace: Mark the 'rseq' arg in the rseq syscall as coming from user spaceArnaldo Carvalho de Melo
With that it uses the generic BTF based pretty printer: root@number:~# grep -w rseq /sys/kernel/tracing/events/syscalls/sys_enter_rseq/format field:struct rseq * rseq; offset:16; size:8; signed:0; print fmt: "rseq: 0x%08lx, rseq_len: 0x%08lx, flags: 0x%08lx, sig: 0x%08lx", ((unsigned long)(REC->rseq)), ((unsigned long)(REC->rseq_len)), ((unsigned long)(REC->flags)), ((unsigned long)(REC->sig)) root@number:~# Before: root@number:~# perf trace -e rseq 0.000 ( 0.017 ms): Isolated Web C/1195452 rseq(rseq: 0x7ff0ecfe6fe0, rseq_len: 32, sig: 1392848979) = 0 74.018 ( 0.006 ms): :1195453/1195453 rseq(rseq: 0x7f2af20fffe0, rseq_len: 32, sig: 1392848979) = 0 1817.220 ( 0.009 ms): Isolated Web C/1195454 rseq(rseq: 0x7f5c9ec7dfe0, rseq_len: 32, sig: 1392848979) = 0 2515.526 ( 0.034 ms): :1195455/1195455 rseq(rseq: 0x7f61503fffe0, rseq_len: 32, sig: 1392848979) = 0 ^Croot@number:~# After: root@number:~# perf trace -e rseq 0.000 ( 0.019 ms): Isolated Web C/1197258 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)4,.cpu_id = (__u32)4,.mm_cid = (__u32)5,}, rseq_len: 32, sig: 1392848979) = 0 1663.835 ( 0.019 ms): Isolated Web C/1197259 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)24,.cpu_id = (__u32)24,.mm_cid = (__u32)2,}, rseq_len: 32, sig: 1392848979) = 0 4750.444 ( 0.018 ms): Isolated Web C/1197260 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)8,.cpu_id = (__u32)8,.mm_cid = (__u32)4,}, rseq_len: 32, sig: 1392848979) = 0 4994.132 ( 0.018 ms): Isolated Web C/1197261 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)10,.cpu_id = (__u32)10,.mm_cid = (__u32)1,}, rseq_len: 32, sig: 1392848979) = 0 4997.578 ( 0.011 ms): Isolated Web C/1197263 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)16,.cpu_id = (__u32)16,.mm_cid = (__u32)4,}, rseq_len: 32, sig: 1392848979) = 0 4997.462 ( 0.014 ms): Isolated Web C/1197262 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)17,.cpu_id = (__u32)17,.mm_cid = (__u32)3,}, rseq_len: 32, sig: 1392848979) = 0 ^Croot@number:~# We'll probably need to come up with some way for using the BTF info to synthesize a test that then gets used and captures the output of the 'perf trace' output to check if the arguments are the ones synthesized, randomically, for now, lets make do manually: root@number:~# cat ~acme/c/rseq.c #include <sys/syscall.h> /* Definition of SYS_* constants */ #include <linux/rseq.h> #include <errno.h> #include <string.h> #include <unistd.h> #include <stdint.h> #include <stdio.h> /* Provide own rseq stub because glibc doesn't */ __attribute__((weak)) int sys_rseq(struct rseq *rseq, __u32 rseq_len, int flags, __u32 sig) { return syscall(SYS_rseq, rseq, rseq_len, flags, sig); } int main(int argc, char *argv[]) { struct rseq rseq = { .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }; int err = sys_rseq(&rseq, sizeof(rseq), 98765, 0xdeadbeaf); printf("sys_rseq({ .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }, %d, 0) = %d (%s)\n", sizeof(rseq), err, strerror(errno)); return err; } root@number:~# perf trace -e rseq ~acme/c/rseq sys_rseq({ .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }, 32, 0) = -1 (Invalid argument) 0.000 ( 0.003 ms): rseq/1200640 rseq(rseq: (struct rseq){}, rseq_len: 32, sig: 1392848979) = 0.064 ( 0.001 ms): rseq/1200640 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)12,.cpu_id = (__u32)34,.rseq_cs = (__u64)56,.flags = (__u32)78,.node_id = (__u32)90,.mm_cid = (__u32)12,}, rseq_len: 32, flags: 98765, sig: 3735928495) = -1 EINVAL (Invalid argument) root@number:~#root@number:~# cat ~acme/c/rseq.c #include <sys/syscall.h> /* Definition of SYS_* constants */ #include <linux/rseq.h> #include <errno.h> #include <string.h> #include <unistd.h> #include <stdint.h> #include <stdio.h> /* Provide own rseq stub because glibc doesn't */ __attribute__((weak)) int sys_rseq(struct rseq *rseq, __u32 rseq_len, int flags, __u32 sig) { return syscall(SYS_rseq, rseq, rseq_len, flags, sig); } int main(int argc, char *argv[]) { struct rseq rseq = { .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }; int err = sys_rseq(&rseq, sizeof(rseq), 98765, 0xdeadbeaf); printf("sys_rseq({ .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }, %d, 0) = %d (%s)\n", sizeof(rseq), err, strerror(errno)); return err; } root@number:~# perf trace -e rseq ~acme/c/rseq sys_rseq({ .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }, 32, 0) = -1 (Invalid argument) 0.000 ( 0.003 ms): rseq/1200640 rseq(rseq: (struct rseq){}, rseq_len: 32, sig: 1392848979) = 0.064 ( 0.001 ms): rseq/1200640 rseq(rseq: (struct rseq){.cpu_id_start = (__u32)12,.cpu_id = (__u32)34,.rseq_cs = (__u64)56,.flags = (__u32)78,.node_id = (__u32)90,.mm_cid = (__u32)12,}, rseq_len: 32, flags: 98765, sig: 3735928495) = -1 EINVAL (Invalid argument) root@number:~# Interesting, glibc seems to be using rseq here, as in addition to the totally fake one this test case uses, we have this one, around these other syscalls: 0.175 ( 0.001 ms): rseq/1201095 set_tid_address(tidptr: 0x7f6def759a10) = 1201095 (rseq) 0.177 ( 0.001 ms): rseq/1201095 set_robust_list(head: 0x7f6def759a20, len: 24) = 0 0.178 ( 0.001 ms): rseq/1201095 rseq(rseq: (struct rseq){}, rseq_len: 32, sig: 1392848979) = 0.231 ( 0.005 ms): rseq/1201095 mprotect(start: 0x7f6def93f000, len: 16384, prot: READ) = 0 0.238 ( 0.003 ms): rseq/1201095 mprotect(start: 0x403000, len: 4096, prot: READ) = 0 0.244 ( 0.004 ms): rseq/1201095 mprotect(start: 0x7f6def99c000, len: 8192, prot: READ) Matches strace (well, not really as the strace in fedora:40 doesn't know about rseq, printing just integer values in hex): set_robust_list(0x7fbc6acc7a20, 24) = 0 rseq(0x7fbc6acc8060, 0x20, 0, 0x53053053) = 0 mprotect(0x7fbc6aead000, 16384, PROT_READ) = 0 mprotect(0x403000, 4096, PROT_READ) = 0 mprotect(0x7fbc6af0a000, 8192, PROT_READ) = 0 prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0 munmap(0x7fbc6aebd000, 81563) = 0 rseq(0x7fff15bb9920, 0x20, 0x181cd, 0xdeadbeaf) = -1 EINVAL (Invalid argument) fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x9), ...}) = 0 getrandom("\xd0\x34\x97\x17\x61\xc2\x2b\x10", 8, GRND_NONBLOCK) = 8 brk(NULL) = 0x18ff4000 brk(0x19015000) = 0x19015000 write(1, "sys_rseq({ .cpu_id_start = 12, ."..., 136sys_rseq({ .cpu_id_start = 12, .cpu_id = 34, .rseq_cs = 56, .flags = 78, .node_id = 90, .mm_cid = 12, }, 32, 0) = -1 (Invalid argument) ) = 136 exit_group(-1) = ? +++ exited with 255 +++ root@number:~# And also the focus for the v6.13 should be to have a better, strace like BTF pretty printer as one of the outputs we can get from the libbpf BTF dumper. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alan Maguire <alan.maguire@oracle.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/lkml/ZuH2K1LLt1pIDkbd@x1 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-11Merge branches 'acpi-video', 'acpi-resource', 'acpi-pad' and 'acpi-misc'Rafael J. Wysocki
Merge ACPI backlight (video) driver update, ACPI resource management updates, an ACPI processor aggregator device (PAD) driver fix, and miscellaneous ACPI updates for 6.12-rc1: - Add force_vendor quirk for Panasonic Toughbook CF-18 in the ACPI backlight driver (Hans de Goede). - Make the DMI checks related to backlight handling on Lenovo Yoga Tab 3 X90F less strict (Hans de Goede). - Enforce native backlight handling on Apple MacbookPro9,2 (Esther Shimanovich). - Add IRQ override quirks for Asus Vivobook Go E1404GAB and MECHREV GM7XG0M, and refine the TongFang GMxXGxx quirk (Li Chen, Tamim Khan, Werner Sembach). - Fix crash in exit_round_robin() in the ACPI processor aggregator device (PAD) driver (Seiji Nishikawa). - Define and use symbols for device and class name lengths in the ACPI bus type code and make the code use strscpy() instead of strcpy() in several places (Muhammad Qasim Abdul Majeed). * acpi-video: ACPI: video: Add force_vendor quirk for Panasonic Toughbook CF-18 ACPI: x86: Make Lenovo Yoga Tab 3 X90F DMI match less strict ACPI: video: Make Lenovo Yoga Tab 3 X90F DMI match less strict ACPI: video: force native for Apple MacbookPro9,2 * acpi-resource: ACPI: resource: Add another DMI match for the TongFang GMxXGxx ACPI: resource: Skip IRQ override on Asus Vivobook Go E1404GAB ACPI: resource: Do IRQ override on MECHREV GM7XG0M * acpi-pad: ACPI: PAD: fix crash in exit_round_robin() * acpi-misc: ACPI: button: Use strscpy() instead of strcpy() ACPI: bus: Define and use symbols for device and class name lengths ACPI: battery : Use strscpy() instead of strcpy() ACPI: acpi_processor: Use strscpy instead() of strcpy() ACPI: PAD: Use strscpy() instead of strcpy() ACPI: AC: Use strscpy() instead of strcpy()
2024-09-11io_uring/rsrc: add reference count to struct io_mapped_ubufJens Axboe
Currently there's a single ring owner of a mapped buffer, and hence the reference count will always be 1 when it's torn down and freed. However, in preparation for being able to link io_mapped_ubuf to different spots, add a reference count to manage the lifetime of it. Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11io_uring/rsrc: clear 'slot' entry upfrontJens Axboe
No functional changes in this patch, but clearing the slot pointer earlier will be required by a later change. Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11Merge branches 'acpi-battery', 'acpi-pmic', 'acpi-cppc' and 'acpi-processor'Rafael J. Wysocki
Merge ACPI battery driver, ACPI PMIC driver, ACPI processor driver and ACPI CPPC library updates for 6.12-rc1: - Use the driver core for the async probing management in the ACPI battery driver (Thomas Weißschuh). - Remove redundant initalizations of a local variable to NULL from the ACPI battery driver (Ilpo Järvinen). - Use strscpy() instead of strcpy() in the ACPI battery driver (Muhammad Qasim Abdul Majeed). - Remove unneeded check in tps68470_pmic_opregion_probe() (Aleksandr Mishin). - Add support for setting the EPP register through the ACPI CPPC sysfs interface if it is in FFH (Mario Limonciello). - Fix MASK_VAL() usage in the ACPI CPPC library (Clément Léger). - Reduce the log level of a per-CPU message about idle states in the ACPI processor driver (Li RongQing). * acpi-battery: ACPI: battery: use driver core managed async probing ACPI: battery: Remove redundant NULL initalizations ACPI: battery: Use strscpy() instead of strcpy() * acpi-pmic: ACPI: PMIC: Remove unneeded check in tps68470_pmic_opregion_probe() * acpi-cppc: ACPI: CPPC: Add support for setting EPP register in FFH ACPI: CPPC: Fix MASK_VAL() usage * acpi-processor: ACPI: processor: Reduce the log level of a per-CPU message about idle states
2024-09-11Merge branches 'acpi-ec', 'acpi-sysfs', 'acpi-utils' and 'acpi-soc'Rafael J. Wysocki
Merge an ACPI EC driver update, ACPI sysfs interface updates, an ACPI library function update, and an ACPI APD driver update for 6.12-rc1: - Do not release locks during operation region accesses in the ACPI EC driver (Rafael Wysocki). - Fix up the _STR handling in the ACPI device object sysfs interface, make it represent the device object attributes as an attribute group and make it rely on driver core functionality for sysfs attrubute management (Thomas Weißschuh). - Extend error messages printed to the kernel log when acpi_evaluate_dsm() fails to include revision and function number (David Wang). - Add a new AMDI0015 platform device ID to the ACPi APD driver for AMD SoCs (Shyam Sundar S K). * acpi-ec: ACPI: EC: Do not release locks during operation region accesses * acpi-sysfs: ACPI: sysfs: remove return value of acpi_device_setup_files() ACPI: sysfs: manage sysfs attributes through device core ACPI: sysfs: manage attributes as attribute_group ACPI: sysfs: evaluate _STR on each sysfs access ACPI: sysfs: validate return type of _STR method * acpi-utils: ACPI: utils: Add rev/func to message when acpi_evaluate_dsm() fails * acpi-soc: ACPI: APD: Add AMDI0015 as platform device
2024-09-11Merge branch 'acpi-riscv'Rafael J. Wysocki
Merge ACPI and irqchip updates related to external interrupt controller support on RISC-V: - Add ACPI device enumeration support for interrupt controller probing including taking dependencies into account (Sunil V L). - Implement ACPI-based interrupt controller probing on RISC-V (Sunil V L). - Add ACPI support for AIA in riscv-intc and add ACPI support to riscv-imsic, riscv-aplic, and sifive-plic (Sunil V L). * acpi-riscv: irqchip/sifive-plic: Add ACPI support irqchip/riscv-aplic: Add ACPI support irqchip/riscv-imsic: Add ACPI support irqchip/riscv-imsic-state: Create separate function for DT irqchip/riscv-intc: Add ACPI support for AIA ACPI: RISC-V: Implement function to add implicit dependencies ACPI: RISC-V: Initialize GSI mapping structures ACPI: RISC-V: Implement function to reorder irqchip probe entries ACPI: RISC-V: Implement PCI related functionality ACPI: pci_link: Clear the dependencies after probe ACPI: bus: Add RINTC IRQ model for RISC-V ACPI: scan: Define weak function to populate dependencies ACPI: scan: Add RISC-V interrupt controllers to honor list ACPI: scan: Refactor dependency creation ACPI: bus: Add acpi_riscv_init() function ACPI: scan: Add a weak arch_sort_irqchip_probe() to order the IRQCHIP probe arm64: PCI: Migrate ACPI related functions to pci-acpi.c
2024-09-11Merge branch 'acpica'Rafael J. Wysocki
Merge ACPICA updates for 6.12-rc1: - Check return value in acpi_db_convert_to_package() (Pei Xiao). - Detect FACS and allow setting the waking vector on reduced-hardware ACPI platforms (Jiaqing Zhao). - Allow ACPICA to represent semaphores as integers (Adrien Destugues). - Complete CXL 3.0 CXIMS structures support in ACPICA (Zhang Rui). - Make ACPICA support SPCR version 4 and add RISC-V SBI Subtype to DBG2 (Sia Jee Heng). - Implement the Dword_PCC Resource Descriptor Macro in ACPICA (Jose Marinho). - Correct the typo in struct acpi_mpam_msc_node member (Punit Agrawal). - Implement ACPI_WARNING_ONCE() and ACPI_ERROR_ONCE() and use them to prevent a Stall() violation warning from being printed every time this takes place (Vasily Khoruzhick). - Allow PCC Data Type in MCTP resource (Adam Young). - Fix memory leaks on acpi_ps_get_next_namepath() and acpi_ps_get_next_field() failures (Armin Wolf). - Add support for supressing leading zeros in hex strings when converting them to integers and update integer-to-hex-string conversions in ACPICA (Armin Wolf). - Add support for Windows 11 22H2 _OSI string (Armin Wolf). - Avoid warning for Dump Functions in ACPICA (Adam Lackorzynski). - Add extended linear address mode to HMAT MSCIS in ACPICA (Dave Jiang). - Handle empty connection_node in iasl (Aleksandrs Vinarskis). - Allow for more flexibility in _DSM args (Saket Dumbre). - Setup for ACPICA release 20240827 (Saket Dumbre). * acpica: (23 commits) ACPICA: Setup for ACPICA release 20240827 ACPICA: Allow for more flexibility in _DSM args ACPICA: iasl: handle empty connection_node ACPICA: HMAT: Add extended linear address mode to MSCIS ACPICA: Avoid warning for Dump Functions ACPICA: Add support for Windows 11 22H2 _OSI string ACPICA: Update integer-to-hex-string conversions ACPICA: Add support for supressing leading zeros in hex strings ACPICA: Allow for supressing leading zeros when using acpi_ex_convert_to_ascii() ACPICA: Fix memory leak if acpi_ps_get_next_field() fails ACPICA: Fix memory leak if acpi_ps_get_next_namepath() fails ACPICA: Allow PCC Data Type in MCTP resource. ACPICA: executer/exsystem: Don't nag user about every Stall() violating the spec ACPICA: Implement ACPI_WARNING_ONCE and ACPI_ERROR_ONCE ACPICA: MPAM: Correct the typo in struct acpi_mpam_msc_node member ACPICA: Implement the Dword_PCC Resource Descriptor Macro ACPICA: Headers: Add RISC-V SBI Subtype to DBG2 ACPICA: SPCR: Update the SPCR table to version 4 ACPICA: Complete CXL 3.0 CXIMS structures ACPICA: haiku: Fix invalid value used for semaphores ...
2024-09-11fbdev: omapfb: Fix typo in commentAndrew Kreimer
Reported-by: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Kreimer <algonell@gmail.com> Signed-off-by: Helge Deller <deller@gmx.de>
2024-09-11KVM: arm64: Get rid of REG_HIDDEN_USER visibility qualifierMarc Zyngier
Now that REG_HIDDEN_USER has no direct user anymore, remove it entirely and update all users of sysreg_hidden_user() to call sysreg_hidden() instead. Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20240904082419.1982402-4-maz@kernel.org Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-09-11KVM: arm64: Simplify visibility handling of AArch32 SPSR_*Marc Zyngier
Since SPSR_* are not associated with any register in the sysreg array, nor do they have .get_user()/.set_user() helpers, they are invisible to userspace with that encoding. Therefore hidden_user_visibility() serves no purpose here, and can be safely removed. Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20240904082419.1982402-3-maz@kernel.org Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-09-11KVM: arm64: Simplify handling of CNTKCTL_EL12Marc Zyngier
We go trough a great deal of effort to map CNTKCTL_EL12 to CNTKCTL_EL1 while hidding this mapping from userspace via a special visibility helper. However, it would be far simpler to just provide an accessor doing the mapping job, removing the need for a visibility helper. With that done, we can also remove the EL12_REG() macro which serves no purpose. Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20240904082419.1982402-2-maz@kernel.org Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-09-11Merge branch 'tip/sched/core' into sched_ext/for-6.12Tejun Heo
Pull in tip/sched/core to resolve two merge conflicts: - 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()") 5d871a63997f ("sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c") A simple context conflict. The former added __update_blocked_others() in the same #ifdef CONFIG_SMP block that effective_cpu_util() and sched_cpu_util() are in and the latter moved those functions to fair.c. This makes __update_blocked_others() more out of place. Will follow up with a patch to relocate. - 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()") 84d265281d6c ("sched/pelt: Use rq_clock_task() for hw_pressure") The former factored out the body of __update_blocked_others() into update_other_load_avgs(). The latter changed how update_hw_load_avg() is called in the body. Resolved by applying the change to update_other_load_avgs() instead. Signed-off-by: Tejun Heo <tj@kernel.org>
2024-09-11Merge tag 'arm-fixes-6.11-3' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc Pull ARM SoC fixes from Arnd Bergmann: "The bulk of the changes this time are for device tree files in the rockchips platform, addressing correctness issues on individual boards, plus one change in the rk356x SoC file to make it match the binding. The only other changes that came in are - a CPU frequencey scaling fix for JH7110 (RISC-V) - a build fix for the cznic hwrandom driver - a fix for a deadlock in qualcomm uefi secure application firmware driver" * tag 'arm-fixes-6.11-3' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc: platform: cznic: turris-omnia-mcu: fix HW_RANDOM dependency riscv: dts: starfive: jh7110-common: Fix lower rate of CPUfreq by setting PLL0 rate to 1.5GHz firmware: qcom: uefisecapp: Fix deadlock in qcuefi_acquire() arm64: dts: rockchip: Fix compatibles for RK3588 VO{0,1}_GRF dt-bindings: soc: rockchip: Fix compatibles for RK3588 VO{0,1}_GRF arm64: dts: rockchip: override BIOS_DISABLE signal via GPIO hog on RK3399 Puma arm64: dts: rockchip: fix eMMC/SPI corruption when audio has been used on RK3399 Puma arm64: dts: rockchip: fix PMIC interrupt pin in pinctrl for ROCK Pi E arm64: dts: rockchip: Remove broken tsadc pinctrl binding for rk356x
2024-09-11Merge tag 'for-6.11/dm-fixes-2' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm Pull device mapper fix from Mikulas Patocka: - fix a race condition in dm-integrity * tag 'for-6.11/dm-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: dm-integrity: fix a race condition when accessing recalc_sector
2024-09-11Merge tag 'printk-for-6.11-fixup' of ↵Linus Torvalds
git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux Pull printk fix from Petr Mladek: - Fix build of serial_core as a module * tag 'printk-for-6.11-fixup' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux: printk: Export match_devname_and_update_preferred_console()
2024-09-11kernel/workqueue.c: fix DEFINE_PER_CPU_SHARED_ALIGNED expansionBaoquan He
Make tags always produces below annoying warnings: ctags: Warning: kernel/workqueue.c:470: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:474: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:478: null expansion of name pattern "\1" In commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions"), codes in places have been adjusted including cpu_worker_pools definition. I noticed in commit 4cb1ef64609f ("workqueue: Implement BH workqueues to eventually replace tasklets"), cpu_worker_pools definition was unfolded back. Not sure if it was intentionally done or ignored carelessly. Makes change to mute them specifically. Signed-off-by: Baoquan He <bhe@redhat.com> Signed-off-by: Tejun Heo <tj@kernel.org>
2024-09-11minmax: reduce min/max macro expansion in atomisp driverLorenzo Stoakes
Avoid unnecessary nested min()/max() which results in egregious macro expansion. Use clamp_t() as this introduces the least possible expansion, and turn the {s,u}DIGIT_FITTING() macros into inline functions to avoid the nested expansion. This resolves an issue with slackware 15.0 32-bit compilation as reported by Richard Narron. Presumably the min/max fixups would be difficult to backport, this patch should be easier and fix's Richard's problem in 5.15. Reported-by: Richard Narron <richard@aaazen.com> Reviewed-by: Hans de Goede <hdegoede@redhat.com> Closes: https://lore.kernel.org/all/4a5321bd-b1f-1832-f0c-cea8694dc5aa@aaazen.com/ Fixes: 867046cc7027 ("minmax: relax check to allow comparison between unsigned arguments and signed constants") Cc: stable@vger.kernel.org Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-09-11fbdev: pxafb: Fix possible use after free in pxafb_task()Kaixin Wang
In the pxafb_probe function, it calls the pxafb_init_fbinfo function, after which &fbi->task is associated with pxafb_task. Moreover, within this pxafb_init_fbinfo function, the pxafb_blank function within the &pxafb_ops struct is capable of scheduling work. If we remove the module which will call pxafb_remove to make cleanup, it will call unregister_framebuffer function which can call do_unregister_framebuffer to free fbi->fb through put_fb_info(fb_info), while the work mentioned above will be used. The sequence of operations that may lead to a UAF bug is as follows: CPU0 CPU1 | pxafb_task pxafb_remove | unregister_framebuffer(info) | do_unregister_framebuffer(fb_info) | put_fb_info(fb_info) | // free fbi->fb | set_ctrlr_state(fbi, state) | __pxafb_lcd_power(fbi, 0) | fbi->lcd_power(on, &fbi->fb.var) | //use fbi->fb Fix it by ensuring that the work is canceled before proceeding with the cleanup in pxafb_remove. Note that only root user can remove the driver at runtime. Signed-off-by: Kaixin Wang <kxwang23@m.fudan.edu.cn> Signed-off-by: Helge Deller <deller@gmx.de>
2024-09-11bpf: lsm: Set bpf_lsm_blob_sizes.lbs_task to 0Song Liu
bpf task local storage is now using task_struct->bpf_storage, so bpf_lsm_blob_sizes.lbs_task is no longer needed. Remove it to save some memory. Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs") Cc: stable@vger.kernel.org Cc: KP Singh <kpsingh@kernel.org> Cc: Matt Bobrowski <mattbobrowski@google.com> Signed-off-by: Song Liu <song@kernel.org> Acked-by: Matt Bobrowski <mattbobrowski@google.com> Link: https://lore.kernel.org/r/20240911055508.9588-1-song@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11selftests/bpf: Fix arena_atomics failure due to llvm changeYonghong Song
llvm change [1] made a change such that __sync_fetch_and_{and,or,xor}() will generate atomic_fetch_*() insns even if the return value is not used. This is a deliberate choice to make sure barrier semantics are preserved from source code to asm insn. But the change in [1] caused arena_atomics selftest failure. test_arena_atomics:PASS:arena atomics skeleton open 0 nsec libbpf: prog 'and': BPF program load failed: Permission denied libbpf: prog 'and': -- BEGIN PROG LOAD LOG -- arg#0 reference type('UNKNOWN ') size cannot be determined: -22 0: R1=ctx() R10=fp0 ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87 0: (18) r1 = 0xffffc90000064000 ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) 2: (61) r6 = *(u32 *)(r1 +0) ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,v ar_off=(0x0; 0xffffffff)) 3: (85) call bpf_get_current_pid_tgid#14 ; R0_w=scalar() 4: (77) r0 >>= 32 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 5: (5d) if r0 != r6 goto pc+11 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x) ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91 6: (18) r1 = 0x100000000060 ; R1_w=scalar() 8: (bf) r1 = addr_space_cast(r1, 0, 1) ; R1_w=arena 9: (18) r2 = 0x1100000000 ; R2_w=0x1100000000 11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2) BPF_ATOMIC stores into R1 arena is not allowed processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 -- END PROG LOAD LOG -- libbpf: prog 'and': failed to load: -13 libbpf: failed to load object 'arena_atomics' libbpf: failed to load BPF skeleton 'arena_atomics': -13 test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13) #3 arena_atomics:FAIL The reason of the failure is due to [2] where atomic{64,}_fetch_{and,or,xor}() are not allowed by arena addresses. Version 2 of the patch fixed the issue by using inline asm ([3]). But further discussion suggested to find a way from source to generate locked insn which is more user friendly. So in not-merged llvm patch ([4]), if relax memory ordering is used and the return value is not used, locked insn could be generated. So with llvm patch [4] to compile the bpf selftest, the following code __c11_atomic_fetch_and(&and64_value, 0x011ull << 32, memory_order_relaxed); is able to generate locked insn, hence fixing the selftest failure. [1] https://github.com/llvm/llvm-project/pull/106494 [2] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT") [3] https://lore.kernel.org/bpf/20240803025928.4184433-1-yonghong.song@linux.dev/ [4] https://github.com/llvm/llvm-project/pull/107343 Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Link: https://lore.kernel.org/r/20240909223431.1666305-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11Merge branches 'pm-sleep', 'pm-opp' and 'pm-tools'Rafael J. Wysocki
Merge updates related to system sleep, operating performance points (OPP) updates, and PM tooling updates for 6.12-rc1: - Remove unused stub for saveable_highmem_page() and remove deprecated macros from power management documentation (Andy Shevchenko). - Use ysfs_emit() and sysfs_emit_at() in "show" functions in the PM sysfs interface (Xueqin Luo). - Update the maintainers information for the operating-points-v2-ti-cpu DT binding (Dhruva Gole). - Drop unnecessary of_match_ptr() from ti-opp-supply (Rob Herring). - Update directory handling and installation process in the pm-graph Makefile and add .gitignore to ignore sleepgraph.py artifacts to pm-graph (Amit Vadhavana, Yo-Jung Lin). - Make cpupower display residency value in idle-info (Aboorva Devarajan). - Add missing powercap_set_enabled() stub function to cpupower (John B. Wyatt IV). - Add SWIG support to cpupower (John B. Wyatt IV). * pm-sleep: PM: hibernate: Remove unused stub for saveable_highmem_page() Documentation: PM: Discourage use of deprecated macros PM: sleep: Use sysfs_emit() and sysfs_emit_at() in "show" functions PM: hibernate: Use sysfs_emit() and sysfs_emit_at() in "show" functions * pm-opp: dt-bindings: opp: operating-points-v2-ti-cpu: Update maintainers opp: ti: Drop unnecessary of_match_ptr() * pm-tools: pm:cpupower: Add error warning when SWIG is not installed MAINTAINERS: Add Maintainers for SWIG Python bindings pm:cpupower: Include test_raw_pylibcpupower.py pm:cpupower: Add SWIG bindings files for libcpupower pm:cpupower: Add missing powercap_set_enabled() stub function pm-graph: Update directory handling and installation process in Makefile pm-graph: Make git ignore sleepgraph.py artifacts tools/cpupower: display residency value in idle-info
2024-09-11Merge branch 'harden-and-extend-elf-build-id-parsing-logic'Alexei Starovoitov
Andrii Nakryiko says: ==================== Harden and extend ELF build ID parsing logic The goal of this patch set is to extend existing ELF build ID parsing logic, currently mostly used by BPF subsystem, with support for working in sleepable mode in which memory faults are allowed and can be relied upon to fetch relevant parts of ELF file to find and fetch .note.gnu.build-id information. This is useful and important for BPF subsystem itself, but also for PROCMAP_QUERY ioctl(), built atop of /proc/<pid>/maps functionality (see [0]), which makes use of the same build_id_parse() functionality. PROCMAP_QUERY is always called from sleepable user process context, so it doesn't have to suffer from current restrictions of build_id_parse() which are due to the NMI context assumption. Along the way, we harden the logic to avoid TOCTOU, overflow, out-of-bounds access problems. This is the very first patch, which can be backported to older releases, if necessary. We also lift existing limitations of only working as long as ELF program headers and build ID note section is contained strictly within the very first page of ELF file. We achieve all of the above without duplication of logic between sleepable and non-sleepable modes through freader abstraction that manages underlying folio from page cache (on demand) and gives a simple to use direct memory access interface. With that, single page restrictions and adding sleepable mode support is rather straightforward. We also extend existing set of BPF selftests with a few tests targeting build ID logic across sleepable and non-sleepabe contexts (we utilize sleepable and non-sleepable uprobes for that). [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-4-andrii@kernel.org/ v6->v7: - added filemap_invalidate_{lock,unlock}_shared() around read_cache_folio and kept Eduard's Reviewed-by (Eduard); v5->v6: - use local phnum variable in get_build_id_32() (Jann); - switch memcmp() instead of strcmp() in parse_build_id() (Jann); v4->v5: - pass proper file reference to read_cache_folio() (Shakeel); - fix another potential overflow due to two u32 additions (Andi); - add PageUptodate() check to patch #1 (Jann); v3->v4: - fix few more potential overflow and out-of-bounds access issues (Andi); - use purely folio-based implementation for freader (Matthew); v2->v3: - remove unneeded READ_ONCE()s and force phoff to u64 for 32-bit mode (Andi); - moved hardening fixes to the front for easier backporting (Jann); - call freader_cleanup() from build_id_parse_buf() for consistency (Jiri); v1->v2: - ensure MADV_PAGEOUT works reliably by paging data in first (Shakeel); - to fix BPF CI build optionally define MADV_POPULATE_READ in selftest. ==================== Link: https://lore.kernel.org/r/20240829174232.3133883-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11selftests/bpf: add build ID testsAndrii Nakryiko
Add a new set of tests validating behavior of capturing stack traces with build ID. We extend uprobe_multi target binary with ability to trigger uprobe (so that we can capture stack traces from it), but also we allow to force build ID data to be either resident or non-resident in memory (see also a comment about quirks of MADV_PAGEOUT). That way we can validate that in non-sleepable context we won't get build ID (as expected), but with sleepable uprobes we will get that build ID regardless of it being physically present in memory. Also, we add a small add-on linker script which reorders .note.gnu.build-id section and puts it after (big) .text section, putting build ID data outside of the very first page of ELF file. This will test all the relaxations we did in build ID parsing logic in kernel thanks to freader abstraction. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpersAndrii Nakryiko
Add sleepable implementations of bpf_get_stack() and bpf_get_task_stack() helpers and allow them to be used from sleepable BPF program (e.g., sleepable uprobes). Note, the stack trace IPs capturing itself is not sleepable (that would need to be a separate project), only build ID fetching is sleepable and thus more reliable, as it will wait for data to be paged in, if necessary. For that we make use of sleepable build_id_parse() implementation. Now that build ID related internals in kernel/bpf/stackmap.c can be used both in sleepable and non-sleepable contexts, we need to add additional rcu_read_lock()/rcu_read_unlock() protection around fetching perf_callchain_entry, but with the refactoring in previous commit it's now pretty straightforward. We make sure to do rcu_read_unlock (in sleepable mode only) right before stack_map_get_build_id_offset() call which can sleep. By that time we don't have any more use of perf_callchain_entry. Note, bpf_get_task_stack() will fail for user mode if task != current. And for kernel mode build ID are irrelevant. So in that sense adding sleepable bpf_get_task_stack() implementation is a no-op. It feel right to wire this up for symmetry and completeness, but I'm open to just dropping it until we support `user && crosstask` condition. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11bpf: decouple stack_map_get_build_id_offset() from perf_callchain_entryAndrii Nakryiko
Change stack_map_get_build_id_offset() which is used to convert stack trace IP addresses into build ID+offset pairs. Right now this function accepts an array of u64s as an input, and uses array of struct bpf_stack_build_id as an output. This is problematic because u64 array is coming from perf_callchain_entry, which is (non-sleepable) RCU protected, so once we allows sleepable build ID fetching, this all breaks down. But its actually pretty easy to make stack_map_get_build_id_offset() works with array of struct bpf_stack_build_id as both input and output. Which is what this patch is doing, eliminating the dependency on perf_callchain_entry. We require caller to fill out bpf_stack_build_id.ip fields (all other can be left uninitialized), and update in place as we do build ID resolution. We make sure to READ_ONCE() and cache locally current IP value as we used it in a few places to find matching VMA and so on. Given this data is directly accessible and modifiable by user's BPF code, we should make sure to have a consistent view of it. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: don't limit .note.gnu.build-id to the first page in ELFAndrii Nakryiko
With freader we don't need to restrict ourselves to a single page, so let's allow ELF notes to be at any valid position with the file. We also merge parse_build_id() and parse_build_id_buf() as now the only difference between them is note offset overflow, which makes sense to check in all situations. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: implement sleepable build_id_parse() APIAndrii Nakryiko
Extend freader with a flag specifying whether it's OK to cause page fault to fetch file data that is not already physically present in memory. With this, it's now easy to wait for data if the caller is running in sleepable (faultable) context. We utilize read_cache_folio() to bring the desired folio into page cache, after which the rest of the logic works just the same at folio level. Suggested-by: Omar Sandoval <osandov@fb.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: rename build_id_parse() into build_id_parse_nofault()Andrii Nakryiko
Make it clear that build_id_parse() assumes that it can take no page fault by renaming it and current few users to build_id_parse_nofault(). Also add build_id_parse() stub which for now falls back to non-sleepable implementation, but will be changed in subsequent patches to take advantage of sleepable context. PROCMAP_QUERY ioctl() on /proc/<pid>/maps file is using build_id_parse() and will automatically take advantage of more reliable sleepable context implementation. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: remove single-page limit for PHDR searchAndrii Nakryiko
Now that freader allows to access multiple pages transparently, there is no need to limit program headers to the very first ELF file page. Remove this limitation, but still put some sane limit on amount of program headers that we are willing to iterate over (set arbitrarily to 256). Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: take into account e_phoff when fetching program headersAndrii Nakryiko
Current code assumption is that program (segment) headers are following ELF header immediately. This is a common case, but is not guaranteed. So take into account e_phoff field of the ELF header when accessing program headers. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Reported-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: add single folio-based file reader abstractionAndrii Nakryiko
Add freader abstraction that transparently manages fetching and local mapping of the underlying file page(s) and provides a simple direct data access interface. freader_fetch() is the only and single interface necessary. It accepts file offset and desired number of bytes that should be accessed, and will return a kernel mapped pointer that caller can use to dereference data up to requested size. Requested size can't be bigger than the size of the extra buffer provided during initialization (because, worst case, all requested data has to be copied into it, so it's better to flag wrongly sized buffer unconditionally, regardless if requested data range is crossing page boundaries or not). If folio is not paged in, or some of the conditions are not satisfied, NULL is returned and more detailed error code can be accessed through freader->err field. This approach makes the usage of freader_fetch() cleaner. To accommodate accessing file data that crosses folio boundaries, user has to provide an extra buffer that will be used to make a local copy, if necessary. This is done to maintain a simple linear pointer data access interface. We switch existing build ID parsing logic to it, without changing or lifting any of the existing constraints, yet. This will be done separately. Given existing code was written with the assumption that it's always working with a single (first) page of the underlying ELF file, logic passes direct pointers around, which doesn't really work well with freader approach and would be limiting when removing the single page (folio) limitation. So we adjust all the logic to work in terms of file offsets. There is also a memory buffer-based version (freader_init_from_mem()) for cases when desired data is already available in kernel memory. This is used for parsing vmlinux's own build ID note. In this mode assumption is that provided data starts at "file offset" zero, which works great when parsing ELF notes sections, as all the parsing logic is relative to note section's start. Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11lib/buildid: harden build ID parsing logicAndrii Nakryiko
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once. Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation. Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+. Cc: stable@vger.kernel.org Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> Reviewed-by: Jann Horn <jannh@google.com> Suggested-by: Andi Kleen <ak@linux.intel.com> Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib") Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/r/20240829174232.3133883-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-09-11Merge branches 'pm-cpuidle' and 'pm-powercap'Rafael J. Wysocki
Merge cpuidle updates and power capping updates for 6.12-rc1: - Add Granite Rapids Xeon support to intel_idle (Artem Bityutskiy). - Disable promotion to C1E on Jasper Lake and Elkhart Lake in intel_idle (Kai-Heng Feng). - Use scoped device node handling to fix missing of_node_put() and simplify walking OF children in the riscv-sbi cpuidle driver (Krzysztof Kozlowski). - Remove dead code from cpuidle_enter_state() (Dhruva Gole). - Change an error pointer to NULL to fix error handling in the intel_rapl power capping driver (Dan Carpenter). - Fix off by one in get_rpi() in the intel_rapl power capping driver (Dan Carpenter). - Add support for ArrowLake-U to the intel_rapl power capping driver (Sumeet Pawnikar). - Fix the energy-pkg event for AMD CPUs in the intel_rapl power capping driver (Dhananjay Ugwekar). - Add support for AMD family 1Ah processors to the intel_rapl power capping driver (Dhananjay Ugwekar). * pm-cpuidle: cpuidle: remove dead code from cpuidle_enter_state() cpuidle: riscv-sbi: Simplify with scoped for each OF child loop cpuidle: riscv-sbi: Use scoped device node handling to fix missing of_node_put intel_idle: Disable promotion to C1E on Jasper Lake and Elkhart Lake intel_idle: add Granite Rapids Xeon support * pm-powercap: powercap: intel_rapl: Change an error pointer to NULL powercap: intel_rapl: Fix off by one in get_rpi() powercap: intel_rapl: Add support for ArrowLake-U platform powercap/intel_rapl: Fix the energy-pkg event for AMD CPUs powercap/intel_rapl: Add support for AMD family 1Ah
2024-09-11block: implement async io_uring discard cmdPavel Begunkov
io_uring allows implementing custom file specific asynchronous operations via the fops->uring_cmd callback, a.k.a. IORING_OP_URING_CMD requests or just io_uring commands. Use it to add support for async discards. Normally, it first tries to queue up bios in a non-blocking context, and if that fails, we'd retry from a blocking context by returning -EAGAIN to the core io_uring. We always get the result from bios asynchronously by setting a custom bi_end_io callback, at which point we drag the request into the task context to either reissue or complete it and post a completion to the user. Unlike ioctl(BLKDISCARD) with stronger guarantees against races, we only do a best effort attempt to invalidate page cache, and it can race with any writes and reads and leave page cache stale. It's the same kind of races we allow to direct writes. Also, apart from cases where discarding is not allowed at all, e.g. discards are not supported or the file/device is read only, the user should assume that the sector range on disk is not valid anymore, even when an error was returned to the user. Suggested-by: Conrad Meyer <conradmeyer@meta.com> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/2b5210443e4fa0257934f73dfafcc18a77cd0e09.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11block: introduce blk_validate_byte_range()Pavel Begunkov
In preparation to further changes extract a helper function out of blk_ioctl_discard() that validates if we can do IO against the given range of disk byte addresses. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/19a7779323c71e742a2f511e4cf49efcfd68cfd4.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11filemap: introduce filemap_invalidate_pagesPavel Begunkov
kiocb_invalidate_pages() is useful for the write path, however not everything is backed by kiocb and we want to reuse the function for bio based discard implementation. Extract and and reuse a new helper called filemap_invalidate_pages(), which takes a argument indicating whether it should be non-blocking and might return -EAGAIN. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/f81374b52c92d0dce0f01a279d1eed42b54056aa.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11io_uring/cmd: give inline space in request to cmdsPavel Begunkov
Some io_uring commands can use some inline space in io_kiocb. We have 32 bytes in struct io_uring_cmd, expose it. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/7ca779a61ee5e166e535d70df9c7f07b15d8a0ce.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11io_uring/cmd: expose iowq to cmdsPavel Begunkov
When an io_uring request needs blocking context we offload it to the io_uring's thread pool called io-wq. We can get there off ->uring_cmd by returning -EAGAIN, but there is no straightforward way of doing that from an asynchronous callback. Add a helper that would transfer a command to a blocking context. Note, we do an extra hop via task_work before io_queue_iowq(), that's a limitation of io_uring infra we have that can likely be lifted later if that would ever become a problem. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/f735f807d7c8ba50c9452c69dfe5d3e9e535037b.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-09-11Merge branch 'for-6.12/io_uring' into for-6.12/io_uring-discardJens Axboe
* for-6.12/io_uring: (31 commits) io_uring/io-wq: inherit cpuset of cgroup in io worker io_uring/io-wq: do not allow pinning outside of cpuset io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common() io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN io_uring/sqpoll: do not allow pinning outside of cpuset io_uring/eventfd: move refs to refcount_t io_uring: remove unused rsrc_put_fn io_uring: add new line after variable declaration io_uring: add GCOV_PROFILE_URING Kconfig option io_uring/kbuf: add support for incremental buffer consumption io_uring/kbuf: pass in 'len' argument for buffer commit Revert "io_uring: Require zeroed sqe->len on provided-buffers send" io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h io_uring/kbuf: add io_kbuf_commit() helper io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg io_uring: wire up min batch wake timeout io_uring: add support for batch wait timeout io_uring: implement our own schedule timeout handling io_uring: move schedule wait logic into helper io_uring: encapsulate extraneous wait flags into a separate struct ...
2024-09-11Merge branch 'for-6.12/block' into for-6.12/io_uring-discardJens Axboe
* for-6.12/block: (115 commits) block: unpin user pages belonging to a folio at once mm: release number of pages of a folio block: introduce folio awareness and add a bigger size from folio block: Added folio-ized version of bio_add_hw_page() block, bfq: factor out a helper to split bfqq in bfq_init_rq() block, bfq: remove local variable 'bfqq_already_existing' in bfq_init_rq() block, bfq: remove local variable 'split' in bfq_init_rq() block, bfq: remove bfq_log_bfqg() block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator() block, bfq: fix procress reference leakage for bfqq in merge chain block, bfq: fix uaf for accessing waker_bfqq after splitting blk-throttle: support prioritized processing of metadata blk-throttle: remove last_low_overflow_time drbd: Add NULL check for net_conf to prevent dereference in state validation blk-mq: add missing unplug trace event mtip32xx: Remove redundant null pointer checks in mtip_hw_debugfs_init() md: Add new_level sysfs interface zram: Shrink zram_table_entry::flags. zram: Remove ZRAM_LOCK zram: Replace bit spinlocks with a spinlock_t. ...
2024-09-11Merge branch 'pm-cpufreq'Rafael J. Wysocki
Merge cpufreq updates for 6.12-rc1: - Remove LATENCY_MULTIPLIER from cpufreq (Qais Yousef). - Add support for Granite Rapids and Sierra Forest in OOB mode to the intel_pstate cpufreq driver (Srinivas Pandruvada). - Add basic support for CPU capacity scaling on x86 and make the intel_pstate driver set asymmetric CPU capacity on hybrid systems without SMT (Rafael Wysocki). - Add missing MODULE_DESCRIPTION() macros to the powerpc cpufreq driver (Jeff Johnson). - Several OF related cleanups in cpufreq drivers (Rob Herring). - Enable COMPILE_TEST for ARM drivers (Rob Herrring). - Introduce quirks for syscon failures and use socinfo to get revision for TI cpufreq driver (Dhruva Gole, Nishanth Menon). - Minor cleanups in amd-pstate driver (Anastasia Belova, Dhananjay Ugwekar). - Minor cleanups for loongson, cpufreq-dt and powernv cpufreq drivers (Danila Tikhonov, Huacai Chen, and Liu Jing). - Make amd-pstate validate return of any attempt to update EPP limits, which fixes the masking hardware problems (Mario Limonciello). - Move the calculation of the AMD boost numerator outside of amd-pstate, correcting acpi-cpufreq on systems with preferred cores (Mario Limonciello). - Harden preferred core detection in amd-pstate to avoid potential false positives (Mario Limonciello). - Add extra unit test coverage for mode state machine (Mario Limonciello). - Fix an "Uninitialized variables" issue in amd-pstste (Qianqiang Liu). * pm-cpufreq: (35 commits) cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue cpufreq/amd-pstate-ut: Add test case for mode switches cpufreq/amd-pstate: Export symbols for changing modes amd-pstate: Add missing documentation for `amd_pstate_prefcore_ranking` cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore` cpufreq: amd-pstate: Optimize amd_pstate_update_limits() cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator() x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator() x86/amd: Move amd_get_highest_perf() out of amd-pstate ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn ACPI: CPPC: Drop check for non zero perf ratio x86/amd: Rename amd_get_highest_perf() to amd_get_boost_ratio_numerator() ACPI: CPPC: Adjust return code for inline functions in !CONFIG_ACPI_CPPC_LIB x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit() cpufreq: ti-cpufreq: Use socinfo to get revision in AM62 family cpufreq: Fix the cacography in powernv-cpufreq.c cpufreq: ti-cpufreq: Introduce quirks to handle syscon fails appropriately cpufreq: loongson3: Use raw_smp_processor_id() in do_service_request() cpufreq: amd-pstate: add check for cpufreq_cpu_get's return value ...
2024-09-11selftests/ring-buffer: Handle meta-page bigger than the systemVincent Donnefort
Handle the case where the meta-page content is bigger than the system page-size. This prepares the ground for extending features covered by the meta-page. Cc: Shuah Khan <skhan@linuxfoundation.org> Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/20240910162335.2993310-3-vdonnefort@google.com Acked-by: Shuah Khan <skhan@linuxfoundation.org> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-09-11selftests/ring-buffer: Verify the entire meta-page paddingVincent Donnefort
Improve the ring-buffer meta-page test coverage by checking for the entire padding region to be 0 instead of just looking at the first 4 bytes. Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/20240910162335.2993310-2-vdonnefort@google.com Acked-by: Shuah Khan <skhan@linuxfoundation.org> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2024-09-11Merge tag 'amd-pstate-v6.12-2024-09-11' of ↵Rafael J. Wysocki
ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/superm1/linux Merge the second round of amd-pstate changes for 6.12 from Mario Limonciello: "* Move the calculation of the AMD boost numerator outside of amd-pstate, correcting acpi-cpufreq on systems with preferred cores * Harden preferred core detection to avoid potential false positives * Add extra unit test coverage for mode state machine" * tag 'amd-pstate-v6.12-2024-09-11' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/superm1/linux: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue cpufreq/amd-pstate-ut: Add test case for mode switches cpufreq/amd-pstate: Export symbols for changing modes amd-pstate: Add missing documentation for `amd_pstate_prefcore_ranking` cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore` cpufreq: amd-pstate: Optimize amd_pstate_update_limits() cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator() x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator() x86/amd: Move amd_get_highest_perf() out of amd-pstate ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn ACPI: CPPC: Drop check for non zero perf ratio x86/amd: Rename amd_get_highest_perf() to amd_get_boost_ratio_numerator() ACPI: CPPC: Adjust return code for inline functions in !CONFIG_ACPI_CPPC_LIB x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c
2024-09-11perf env: Find correct branch counter info on hybridKan Liang
No event is printed in the "Branch Counter" column on hybrid machines. For example, $ perf record -e "{cpu_core/branch-instructions/pp,cpu_core/branches/}:S" -j any,counter $ perf report --total-cycles # Branch counter abbr list: # cpu_core/branch-instructions/pp = A # cpu_core/branches/ = B # '-' No event occurs # '+' Event occurrences may be lost due to branch counter saturated # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles Branch Counter # ............... .............. ........... .......... .............. 44.54% 727.1K 0.00% 1 |+ |+ | 36.31% 592.7K 0.00% 2 |+ |+ | 17.83% 291.1K 0.00% 1 |+ |+ | The branch counter information (br_cntr_width and br_cntr_nr) in the perf_env is retrieved from the CPU_PMU_CAPS. However, the CPU_PMU_CAPS is not available on hybrid machines. Without the width information, the number of occurrences of an event cannot be calculated. For a hybrid machine, the caps information should be retrieved from the PMU_CAPS, and stored in the perf_env->pmu_caps. Add a perf_env__find_br_cntr_info() to return the correct branch counter information from the corresponding fields. Committer notes: While testing I couldn't s ee those "Branch counter" columns enabled by pressing 'B' on the TUI, after reporting it to the list Kan explained the situation: <quote Kan Liang> For a hybrid client, the "Branch Counter" feature is only supported starting from the just released Lunar Lake. Perf falls back to only "ANY" on your Raptor Lake. The "The branch counter is not available" message is expected. Here is the 'perf evlist' result from my Lunar Lake machine, # perf evlist -v cpu_core/branch-instructions/pp: type: 4 (cpu_core), size: 136, config: 0xc4 (branch-instructions), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|READ|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID|GROUP|LOST, disabled: 1, freq: 1, enable_on_exec: 1, precise_ip: 2, sample_id_all: 1, exclude_guest: 1, branch_sample_type: ANY|COUNTERS # </quote> Fixes: 6f9d8d1de2c61288 ("perf script: Add branch counters") Reviewed-by: Ian Rogers <irogers@google.com> Signed-off-by: Kan Liang <kan.liang@linux.intel.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Link: https://lore.kernel.org/r/20240909184201.553519-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-11perf evlist: Print hint for groupKan Liang
An event group is a critical relationship. There is a -g option that can display the relationship. But it's hard for a user to know when should this option be applied. If there is an event group in the perf record, print a hint to suggest the user apply the -g to display the group information. With the patch, $ perf record -e "{cycles,instructions},instructions" sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.024 MB perf.data (4 samples) ] $ $ perf evlist cycles instructions instructions # Tip: use 'perf evlist -g' to show group information $ perf evlist -g {cycles,instructions} instructions $ Committer testing: So for a perf.data file _with_ a group: root@number:~# perf evlist -g {cpu_core/branch-instructions/pp,cpu_core/branches/} dummy:u root@number:~# perf evlist cpu_core/branch-instructions/pp cpu_core/branches/ dummy:u # Tip: use 'perf evlist -g' to show group information root@number:~# Then for something _without_ a group, no hint: root@number:~# perf record ls <SNIP> [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.035 MB perf.data (7 samples) ] root@number:~# perf evlist cpu_atom/cycles/P cpu_core/cycles/P dummy:u root@number:~# No suggestion, good. Suggested-by: Arnaldo Carvalho de Melo <acme@kernel.org> Reviewed-by: Ian Rogers <irogers@google.com> Signed-off-by: Kan Liang <kan.liang@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Closes: https://lore.kernel.org/lkml/ZttgvduaKsVn1r4p@x1/ Link: https://lore.kernel.org/r/20240908202847.176280-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-11tools: Drop nonsensical -O6Sam James
-O6 is very much not-a-thing. Really, this should've been dropped entirely in 49b3cd306e60b9d8 ("tools: Set the maximum optimization level according to the compiler being used") instead of just passing it for not-Clang. Just collapse it down to -O3, instead of "-O6 unless Clang, in which case -O3". GCC interprets > -O3 as -O3. It doesn't even interpret > -O3 as -Ofast, which is a good thing, given -Ofast has specific (non-)requirements for code built using it. So, this does nothing except look a bit daft. Remove the silliness and also save a few lines in the Makefiles accordingly. Reviewed-by: Ian Rogers <irogers@google.com> Reviewed-by: Jesper Juhl <jesperjuhl76@gmail.com> Signed-off-by: Sam James <sam@gentoo.org> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Bill Wendling <morbo@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Justin Stitt <justinstitt@google.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Nathan Chancellor <nathan@kernel.org> Cc: Nick Desaulniers <ndesaulniers@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/4f01524fa4ea91c7146a41e26ceaf9dae4c127e4.1725821201.git.sam@gentoo.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-09-11Merge branch 'bpf: Allow skb dynptr for tp_btf'Martin KaFai Lau
Philo Lu says: ==================== This makes bpf_dynptr_from_skb usable for tp_btf, so that we can easily parse skb in tracepoints. This has been discussed in [0], and Martin suggested to use dynptr (instead of helpers like bpf_skb_load_bytes). For safety, skb dynptr shouldn't be used in fentry/fexit. This is achieved by add KF_TRUSTED_ARGS flag in bpf_dynptr_from_skb defination, because pointers passed by tracepoint are trusted (PTR_TRUSTED) while those of fentry/fexit are not. Another problem raises that NULL pointers could be passed to tracepoint, such as trace_tcp_send_reset, and we need to recognize them. This is done by add a "__nullable" suffix in the func_proto of the tracepoint, discussed in [1]. 2 Test cases are added, one for "__nullable" suffix, and the other for using skb dynptr in tp_btf. changelog v2 -> v3 (Andrii Nakryiko): Patch 1: - Remove prog type check in prog_arg_maybe_null() - Add bpf_put_raw_tracepoint() after get() - Use kallsyms_lookup() instead of sprintf("%ps") Patch 2: Add separate test "tp_btf_nullable", and use full failure msg v1 -> v2: - Add "__nullable" suffix support (Alexei Starovoitov) - Replace "struct __sk_buff*" with "void*" in test (Martin KaFai Lau) [0] https://lore.kernel.org/all/20240205121038.41344-1-lulie@linux.alibaba.com/T/ [1] https://lore.kernel.org/all/20240430121805.104618-1-lulie@linux.alibaba.com/T/ ==================== Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>