diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/.kunitconfig | 3 | ||||
-rw-r--r-- | init/Kconfig | 323 | ||||
-rw-r--r-- | init/Makefile | 6 | ||||
-rwxr-xr-x | init/build-version | 10 | ||||
-rw-r--r-- | init/do_mounts.c | 6 | ||||
-rw-r--r-- | init/do_mounts.h | 9 | ||||
-rw-r--r-- | init/do_mounts_initrd.c | 5 | ||||
-rw-r--r-- | init/init_task.c | 21 | ||||
-rw-r--r-- | init/initramfs.c | 108 | ||||
-rw-r--r-- | init/initramfs_internal.h | 8 | ||||
-rw-r--r-- | init/initramfs_test.c | 407 | ||||
-rw-r--r-- | init/main.c | 138 |
12 files changed, 841 insertions, 203 deletions
diff --git a/init/.kunitconfig b/init/.kunitconfig new file mode 100644 index 000000000000..acb906b1a5f9 --- /dev/null +++ b/init/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_TEST=y diff --git a/init/Kconfig b/init/Kconfig index 8df18f3a9748..af4c2f085455 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -60,6 +60,13 @@ config LLD_VERSION default $(ld-version) if LD_IS_LLD default 0 +config RUSTC_VERSION + int + default $(rustc-version) + help + It does not depend on `RUST` since that one may need to use the version + in a `depends on`. + config RUST_IS_AVAILABLE def_bool $(success,$(srctree)/scripts/rust_is_available.sh) help @@ -71,18 +78,28 @@ config RUST_IS_AVAILABLE In particular, the Makefile target 'rustavailable' is useful to check why the Rust toolchain is not being detected. +config RUSTC_LLVM_VERSION + int + default $(rustc-llvm-version) + config CC_CAN_LINK bool default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag)) if 64BIT default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag)) -config CC_CAN_LINK_STATIC +# Fixed in GCC 14, 13.3, 12.4 and 11.5 +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 +config GCC_ASM_GOTO_OUTPUT_BROKEN bool - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT - default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) + depends on CC_IS_GCC + default y if GCC_VERSION < 110500 + default y if GCC_VERSION >= 120000 && GCC_VERSION < 120400 + default y if GCC_VERSION >= 130000 && GCC_VERSION < 130300 config CC_HAS_ASM_GOTO_OUTPUT - def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + def_bool y + depends on !GCC_ASM_GOTO_OUTPUT_BROKEN + depends 
on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT depends on CC_HAS_ASM_GOTO_OUTPUT @@ -98,6 +115,33 @@ config CC_HAS_ASM_INLINE config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) +config CC_HAS_COUNTED_BY + bool + # clang needs to be at least 19.1.3 to avoid __bdos miscalculations + # https://github.com/llvm/llvm-project/pull/110497 + # https://github.com/llvm/llvm-project/pull/112636 + default y if CC_IS_CLANG && CLANG_VERSION >= 190103 + # supported since gcc 15.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 + default y if CC_IS_GCC && GCC_VERSION >= 150100 + +config CC_HAS_MULTIDIMENSIONAL_NONSTRING + def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) + +config LD_CAN_USE_KEEP_IN_OVERLAY + # ld.lld prior to 21.0.0 did not support KEEP within an overlay description + # https://github.com/llvm/llvm-project/pull/130661 + def_bool LD_IS_BFD || LLD_VERSION >= 210000 + +config RUSTC_HAS_COERCE_POINTEE + def_bool RUSTC_VERSION >= 108400 + +config RUSTC_HAS_SPAN_FILE + def_bool RUSTC_VERSION >= 108800 + +config RUSTC_HAS_UNNECESSARY_TRANSMUTES + def_bool RUSTC_VERSION >= 108800 + config PAHOLE_VERSION int default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) @@ -106,7 +150,7 @@ config CONSTRUCTORS bool config IRQ_WORK - bool + def_bool y if SMP config BUILDTIME_TABLE_SORT bool @@ -299,8 +343,9 @@ config KERNEL_XZ BCJ filters which can improve compression ratio of executable code. The size of the kernel is about 30% smaller with XZ in comparison to gzip. On architectures for which there is a BCJ - filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ - will create a few percent smaller kernel than plain LZMA. 
+ filter (i386, x86_64, ARM, ARM64, RISC-V, big endian PowerPC, + and SPARC), XZ will create a few percent smaller kernel than + plain LZMA. The speed is about the same as with LZMA: The decompression speed of XZ is better than that of bzip2 but worse than gzip @@ -436,16 +481,6 @@ config CROSS_MEMORY_ATTACH to directly read from or write to another process' address space. See the man page for more details. -config USELIB - bool "uselib syscall (for libc5 and earlier)" - default ALPHA || M68K || SPARC - help - This option enables the uselib syscall, a system call used in the - dynamic linker from libc5 and earlier. glibc does not use this - system call. If you intend to run programs built on libc5 or - earlier, you may need to enable this syscall. Current systems - running glibc can safely disable this. - config AUDIT bool "Auditing support" depends on NET @@ -538,24 +573,24 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP -config SCHED_THERMAL_PRESSURE +config SCHED_HW_PRESSURE bool default y if ARM && ARM_CPU_TOPOLOGY default y if ARM64 depends on SMP depends on CPU_FREQ_THERMAL help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler + Select this option to enable HW pressure accounting in the + scheduler. HW pressure is the value conveyed to the scheduler that reflects the reduction in CPU compute capacity resulted from - thermal throttling. Thermal throttling occurs when the performance of - a CPU is capped due to high operating temperatures. + HW throttling. HW throttling occurs when the performance of + a CPU is capped due to high operating temperatures as an example. If selected, the scheduler will be able to balance tasks accordingly, i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_update_thermal_pressure() and arch_scale_thermal_pressure(). 
+ arch_update_hw_pressure() and arch_scale_hw_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" @@ -671,7 +706,7 @@ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION bool "CPU isolation" - depends on SMP || COMPILE_TEST + depends on SMP default y help Make sure that CPUs running critical tasks are not disturbed by @@ -734,8 +769,8 @@ config LOG_CPU_MAX_BUF_SHIFT int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" depends on SMP range 0 21 - default 12 if !BASE_SMALL default 0 if BASE_SMALL + default 12 depends on PRINTK help This option allows to increase the default ring buffer size @@ -867,14 +902,26 @@ config CC_IMPLICIT_FALLTHROUGH default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough) -# Currently, disable gcc-11+ array-bounds globally. +# Currently, disable gcc-9+ array-bounds globally. # It's still broken in gcc-13, so no upper bound yet. -config GCC11_NO_ARRAY_BOUNDS +config GCC10_NO_ARRAY_BOUNDS def_bool y config CC_NO_ARRAY_BOUNDS bool - default y if CC_IS_GCC && GCC_VERSION >= 110000 && GCC11_NO_ARRAY_BOUNDS + default y if CC_IS_GCC && GCC_VERSION >= 90000 && GCC10_NO_ARRAY_BOUNDS + +# Currently, disable -Wstringop-overflow for GCC globally. +config GCC_NO_STRINGOP_OVERFLOW + def_bool y + +config CC_NO_STRINGOP_OVERFLOW + bool + default y if CC_IS_GCC && GCC_NO_STRINGOP_OVERFLOW + +config CC_STRINGOP_OVERFLOW + bool + default y if CC_IS_GCC && !CC_NO_STRINGOP_OVERFLOW # # For architectures that know their GCC __int128 support is sound @@ -908,6 +955,9 @@ config NUMA_BALANCING_DEFAULT_ENABLED If set, automatic NUMA balancing will be enabled if running on a NUMA machine. 
+config SLAB_OBJ_EXT + bool + menuconfig CGROUPS bool "Control Group support" select KERNFS @@ -941,14 +991,41 @@ config MEMCG bool "Memory controller" select PAGE_COUNTER select EVENTFD + select SLAB_OBJ_EXT help Provides control over the memory footprint of tasks in a cgroup. -config MEMCG_KMEM +config MEMCG_NMI_UNSAFE + bool + depends on MEMCG + depends on HAVE_NMI + depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !ARCH_HAVE_NMI_SAFE_CMPXCHG + default y + +config MEMCG_NMI_SAFETY_REQUIRES_ATOMIC bool depends on MEMCG + depends on HAVE_NMI + depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && ARCH_HAVE_NMI_SAFE_CMPXCHG default y +config MEMCG_V1 + bool "Legacy cgroup v1 memory controller" + depends on MEMCG + default n + help + Legacy cgroup v1 memory controller which has been deprecated by + cgroup v2 implementation. The v1 is there for legacy applications + which haven't migrated to the new cgroup v2 interface yet. If you + do not have any such application then you are completely fine leaving + this option disabled. + + Please note that feature set of the legacy memory controller is likely + going to shrink due to deprecation process. New deployments with v1 + controller are highly discouraged. + + Say N if unsure. + config BLK_CGROUP bool "IO controller" depends on BLOCK @@ -985,9 +1062,13 @@ menuconfig CGROUP_SCHED tasks. if CGROUP_SCHED +config GROUP_SCHED_WEIGHT + def_bool n + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED + select GROUP_SCHED_WEIGHT default CGROUP_SCHED config CFS_BANDWIDTH @@ -1012,6 +1093,23 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. +config RT_GROUP_SCHED_DEFAULT_DISABLED + bool "Require boot parameter to enable group scheduling for SCHED_RR/FIFO" + depends on RT_GROUP_SCHED + default n + help + When set, the RT group scheduling is disabled by default. 
The option + is in inverted form so that mere RT_GROUP_SCHED enables the group + scheduling. + + Say N if unsure. + +config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + select GROUP_SCHED_WEIGHT + default y + endif #CGROUP_SCHED config SCHED_MM_CID @@ -1066,6 +1164,16 @@ config CGROUP_RDMA Attaching processes with active RDMA resources to the cgroup hierarchy is allowed even if can cross the hierarchy's limit. +config CGROUP_DMEM + bool "Device memory controller (DMEM)" + select PAGE_COUNTER + help + The DMEM controller allows compatible devices to restrict device + memory usage based on the cgroup hierarchy. + + As an example, it allows you to restrict VRAM usage for applications + in the DRM subsystem. + config CGROUP_FREEZER bool "Freezer controller" help @@ -1096,6 +1204,7 @@ config CGROUP_HUGETLB config CPUSETS bool "Cpuset controller" depends on SMP + select UNION_FIND help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -1104,9 +1213,23 @@ config CPUSETS Say N if unsure. +config CPUSETS_V1 + bool "Legacy cgroup v1 cpusets controller" + depends on CPUSETS + default n + help + Legacy cgroup v1 cpusets controller which has been deprecated by + cgroup v2 implementation. The v1 is there for legacy applications + which haven't migrated to the new cgroup v2 interface yet. Legacy + interface includes cpuset filesystem and /proc/<pid>/cpuset. If you + do not have any such application then you are completely fine leaving + this option disabled. + + Say N if unsure. + config PROC_PID_CPUSET bool "Include legacy /proc/<pid>/cpuset file" - depends on CPUSETS + depends on CPUSETS_V1 default y config CGROUP_DEVICE @@ -1357,6 +1480,13 @@ config INITRAMFS_PRESERVE_MTIME If unsure, say Y. 
+config INITRAMFS_TEST + bool "Test initramfs cpio archive extraction" if !KUNIT_ALL_TESTS + depends on BLK_DEV_INITRD && KUNIT=y + default KUNIT_ALL_TESTS + help + Build KUnit tests for initramfs. See Documentation/dev-tools/kunit + choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE @@ -1442,13 +1572,18 @@ config SYSCTL_ARCH_UNALIGN_ALLOW the unaligned access emulation. see arch/parisc/kernel/unaligned.c for reference -config HAVE_PCSPKR_PLATFORM - bool +config SYSFS_SYSCALL + bool "Sysfs syscall support" + default n + help + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. -# interpreter that classic socket filters depend on -config BPF + If unsure say N here. + +config HAVE_PCSPKR_PLATFORM bool - select CRYPTO_LIB_SHA1 menuconfig EXPERT bool "Configure standard kernel features (expert users)" @@ -1483,7 +1618,7 @@ config MULTIUSER config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH + default PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH help sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc but still enabled by default in some @@ -1491,16 +1626,6 @@ config SGETMASK_SYSCALL If unsure, leave the default option here. -config SYSFS_SYSCALL - bool "Sysfs syscall support" if EXPERT - default y - help - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - config FHANDLE bool "open by fhandle syscalls" if EXPERT select EXPORTFS @@ -1569,11 +1694,10 @@ config PCSPKR_PLATFORM This option allows to disable the internal PC-Speaker support, saving some memory. 
-config BASE_FULL - default y - bool "Enable full-sized data structures for core" if EXPERT +config BASE_SMALL + bool "Enable smaller-sized data structures for core" if EXPERT help - Disabling this option reduces the size of miscellaneous core + Enabling this option reduces the size of miscellaneous core kernel data structures. This saves memory on small machines, but may reduce performance. @@ -1592,6 +1716,16 @@ config FUTEX_PI depends on FUTEX && RT_MUTEXES default y +config FUTEX_PRIVATE_HASH + bool + depends on FUTEX && !BASE_SMALL && MMU + default y + +config FUTEX_MPOL + bool + depends on FUTEX && NUMA + default y + config EPOLL bool "Enable eventpoll support" if EXPERT default y @@ -1654,6 +1788,19 @@ config IO_URING applications to submit and complete IO through submission and completion rings that are shared between the kernel and application. +config GCOV_PROFILE_URING + bool "Enable GCOV profiling on the io_uring subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on the io_uring subsystem, to facilitate + code coverage testing. + + If unsure, say N. + + Note that this will have a negative impact on the performance of + the io_uring subsystem, hence this should only be enabled for + specific test purposes. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y @@ -1765,36 +1912,35 @@ config KALLSYMS_ALL Say N unless you really need all symbols, or kernel live patching. 
-config KALLSYMS_ABSOLUTE_PERCPU +# end of the "standard kernel features (expert users)" menu + +config ARCH_HAS_MEMBARRIER_CALLBACKS bool - depends on KALLSYMS - default X86_64 && SMP -config KALLSYMS_BASE_RELATIVE +config ARCH_HAS_MEMBARRIER_SYNC_CORE + bool + +config ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS bool - depends on KALLSYMS - default y help - Instead of emitting them as absolute values in the native word size, - emit the symbol references in the kallsyms table as 32-bit entries, - each containing a relative value in the range [base, base + U32_MAX] - or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either - an absolute value in the range [0, S32_MAX] or a relative value in the - range [base, base + S32_MAX], where base is the lowest relative symbol - address encountered in the image. + Control MSEAL_SYSTEM_MAPPINGS access based on architecture. - On 64-bit builds, this reduces the size of the address table by 50%, - but more importantly, it results in entries whose values are build - time constants, and no relocation pass is required at runtime to fix - up the entries based on the runtime load address of the kernel. + A 64-bit kernel is required for the memory sealing feature. + No specific hardware features from the CPU are needed. -# end of the "standard kernel features (expert users)" menu + To enable this feature, the architecture needs to update their + special mappings calls to include the sealing flag and confirm + that it doesn't unmap/remap system mappings during the life + time of the process. The existence of this flag for an architecture + implies that it does not require the remapping of the system + mappings during process lifetime, so sealing these mappings is safe + from a kernel perspective. -config ARCH_HAS_MEMBARRIER_CALLBACKS - bool + After the architecture enables this, a distribution can set + CONFIG_MSEAL_SYSTEM_MAPPING to manage access to the feature. 
-config ARCH_HAS_MEMBARRIER_SYNC_CORE - bool + For complete descriptions of memory sealing, please see + Documentation/userspace-api/mseal.rst config HAVE_PERF_EVENTS bool @@ -1883,11 +2029,16 @@ config RUST bool "Rust support" depends on HAVE_RUST depends on RUST_IS_AVAILABLE - depends on !MODVERSIONS - depends on !GCC_PLUGINS + select EXTENDED_MODVERSIONS if MODVERSIONS + depends on !MODVERSIONS || GENDWARFKSYMS + depends on !GCC_PLUGIN_RANDSTRUCT depends on !RANDSTRUCT - depends on !DEBUG_INFO_BTF || PAHOLE_HAS_LANG_EXCLUDE - select CONSTRUCTORS + depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO) + depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC + select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG + depends on !CALL_PADDING || RUSTC_VERSION >= 108100 + depends on !KASAN_SW_TAGS + depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300 help Enables Rust support in the kernel. @@ -1904,12 +2055,19 @@ config RUST config RUSTC_VERSION_TEXT string depends on RUST - default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n) + default "$(RUSTC_VERSION_TEXT)" + help + See `CC_VERSION_TEXT`. config BINDGEN_VERSION_TEXT string depends on RUST - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) + # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 + # (https://github.com/rust-lang/rust-bindgen/pull/2678) and 0.71.0 + # (https://github.com/rust-lang/rust-bindgen/pull/3040). It can be removed + # when the minimum version is upgraded past the latter (0.69.1 and 0.71.1 + # both fixed the issue). + default "$(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null)" # # Place an empty function call at each tracepoint site. 
Can be @@ -1917,6 +2075,7 @@ config BINDGEN_VERSION_TEXT # config TRACEPOINTS bool + select TASKS_TRACE_RCU source "kernel/Kconfig.kexec" @@ -1928,11 +2087,6 @@ config RT_MUTEXES bool default y if PREEMPT_RT -config BASE_SMALL - int - default 0 if BASE_FULL - default 1 if !BASE_FULL - config MODULE_SIG_FORMAT def_bool n select SYSTEM_DATA_VERIFICATION @@ -1970,6 +2124,9 @@ source "kernel/Kconfig.locks" config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE bool +config ARCH_HAS_PREPARE_SYNC_CORE_CMD + bool + config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool diff --git a/init/Makefile b/init/Makefile index cbac576c57d6..d6f75d8907e0 100644 --- a/init/Makefile +++ b/init/Makefile @@ -12,6 +12,7 @@ else obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o endif obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o +obj-$(CONFIG_INITRAMFS_TEST) += initramfs_test.o obj-y += init_task.o @@ -52,13 +53,10 @@ CFLAGS_version.o := -include $(obj)/utsversion-tmp.h # Build version-timestamp.c with final UTS_VERSION # -include/generated/utsversion.h: build-version-auto = $(shell $(srctree)/$(src)/build-version) +include/generated/utsversion.h: build-version-auto = $(shell $(srctree)/scripts/build-version) include/generated/utsversion.h: build-timestamp-auto = $(shell LC_ALL=C date) include/generated/utsversion.h: FORCE $(call filechk,uts_version) $(obj)/version-timestamp.o: include/generated/utsversion.h CFLAGS_version-timestamp.o := -include include/generated/utsversion.h -KASAN_SANITIZE_version-timestamp.o := n -KCSAN_SANITIZE_version-timestamp.o := n -GCOV_PROFILE_version-timestamp.o := n diff --git a/init/build-version b/init/build-version deleted file mode 100755 index 537d45815083..000000000000 --- a/init/build-version +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0-only - -prev_ver=$(cat .version 2>/dev/null) && -ver=$(expr ${prev_ver} + 1 2>/dev/null) || -ver=1 - -echo ${ver} > .version - -echo ${ver} diff --git a/init/do_mounts.c b/init/do_mounts.c index 
279ad28bf4fb..6af29da8889e 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -159,8 +159,7 @@ static int __init do_mount_root(const char *name, const char *fs, if (!p) return -ENOMEM; data_page = page_address(p); - /* zero-pad. init_mount() will make sure it's terminated */ - strncpy(data_page, data, PAGE_SIZE); + strscpy_pad(data_page, data, PAGE_SIZE); } ret = init_mount(name, "/root", fs, flags, data_page); @@ -208,6 +207,9 @@ retry: goto out; case -EACCES: case -EINVAL: +#ifdef CONFIG_BLOCK + init_flush_fput(); +#endif continue; } /* diff --git a/init/do_mounts.h b/init/do_mounts.h index 15e372b00ce7..6069ea3eb80d 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -9,6 +9,8 @@ #include <linux/major.h> #include <linux/root_dev.h> #include <linux/init_syscalls.h> +#include <linux/task_work.h> +#include <linux/file.h> void mount_root_generic(char *name, char *pretty_name, int flags); void mount_root(char *root_device_name); @@ -41,3 +43,10 @@ static inline bool initrd_load(char *root_device_name) } #endif + +/* Ensure that async file closing finished to prevent spurious errors. 
*/ +static inline void init_flush_fput(void) +{ + flush_delayed_fput(); + task_work_run(); +} diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 425f4bcf4b77..f6867bad0d78 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -21,7 +21,7 @@ phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_do_mounts_initrd_table[] = { +static const struct ctl_table kern_do_mounts_initrd_table[] = { { .procname = "real-root-dev", .data = &real_root_dev, @@ -29,7 +29,6 @@ static struct ctl_table kern_do_mounts_initrd_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __init int kernel_do_mounts_initrd_sysctls_init(void) @@ -90,7 +89,7 @@ static void __init handle_initrd(char *root_device_name) extern char *envp_init[]; int error; - pr_warn("using deprecated initrd support, will be removed in 2021.\n"); + pr_warn("using deprecated initrd support, will be removed soon.\n"); real_root_dev = new_encode_dev(ROOT_DEV); create_dev("/dev/root.old", Root_RAM0); diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb3da..e557f622bd90 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -6,6 +6,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> +#include <linux/sched/ext.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> @@ -29,8 +30,9 @@ static struct signal_struct init_signals = { .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS - .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), - .cputimer = { + .posix_timers = HLIST_HEAD_INIT, + .ignored_posix_timers = HLIST_HEAD_INIT, + .cputimer = { .cputime_atomic = INIT_CPUTIME_ATOMIC, }, #endif @@ -77,6 +79,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .cpus_ptr = &init_task.cpus_mask, 
.user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, + .max_allowed_capacity = SCHED_CAPACITY_SCALE, .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, @@ -98,6 +101,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, #endif +#ifdef CONFIG_SCHED_CLASS_EXT + .scx = { + .dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node), + .sticky_cpu = -1, + .holding_cpu = -1, + .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .runnable_at = INITIAL_JIFFIES, + .ddsp_dsq_id = SCX_DSQ_INVALID, + .slice = SCX_SLICE_DFL, + }, +#endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), .real_parent = &init_task, @@ -147,6 +161,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .rcu_tasks_holdout = false, .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, + .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list), #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, @@ -197,7 +212,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .trace_recursion = 0, #endif #ifdef CONFIG_LIVEPATCH - .patch_state = KLP_UNDEFINED, + .patch_state = KLP_TRANSITION_IDLE, #endif #ifdef CONFIG_SECURITY .security = NULL, diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb..72bad44a1d41 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -16,8 +16,11 @@ #include <linux/mm.h> #include <linux/namei.h> #include <linux/init_syscalls.h> -#include <linux/task_work.h> #include <linux/umh.h> +#include <linux/security.h> + +#include "do_mounts.h" +#include "initramfs_internal.h" static __initdata bool csum_present; static __initdata u32 io_csum; @@ -73,6 +76,7 @@ static __initdata struct hash { struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; +static __initdata bool hardlink_seen; static inline int hash(int major, int minor, int ino) 
{ @@ -106,19 +110,21 @@ static char __init *find_link(int major, int minor, int ino, strcpy(q->name, name); q->next = NULL; *p = q; + hardlink_seen = true; return NULL; } static void __init free_hash(void) { struct hash **p, *q; - for (p = head; p < head + 32; p++) { + for (p = head; hardlink_seen && p < head + 32; p++) { while (*p) { q = *p; *p = q->next; kfree(q); } } + hardlink_seen = false; } #ifdef CONFIG_INITRAMFS_PRESERVE_MTIME @@ -141,9 +147,8 @@ struct dir_entry { char name[]; }; -static void __init dir_add(const char *name, time64_t mtime) +static void __init dir_add(const char *name, size_t nlen, time64_t mtime) { - size_t nlen = strlen(name) + 1; struct dir_entry *de; de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); @@ -167,7 +172,7 @@ static void __init dir_utime(void) #else static void __init do_utime(char *filename, time64_t mtime) {} static void __init do_utime_path(const struct path *path, time64_t mtime) {} -static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_add(const char *name, size_t nlen, time64_t mtime) {} static void __init dir_utime(void) {} #endif @@ -186,14 +191,11 @@ static __initdata u32 hdr_csum; static void __init parse_header(char *s) { unsigned long parsed[13]; - char buf[9]; int i; - buf[8] = '\0'; - for (i = 0, s += 6; i < 13; i++, s += 8) { - memcpy(buf, s, 8); - parsed[i] = simple_strtoul(buf, NULL, 16); - } + for (i = 0, s += 6; i < 13; i++, s += 8) + parsed[i] = simple_strntoul(s, NULL, 16, 8); + ino = parsed[0]; mode = parsed[1]; uid = parsed[2]; @@ -254,7 +256,7 @@ static __initdata char *header_buf, *symlink_buf, *name_buf; static int __init do_start(void) { - read_into(header_buf, 110, GotHeader); + read_into(header_buf, CPIO_HDRLEN, GotHeader); return 0; } @@ -358,6 +360,15 @@ static int __init do_name(void) { state = SkipIt; next_state = Reset; + + /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ + if (collected[name_len - 1] != '\0') { + pr_err("initramfs 
name without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } + if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; @@ -366,7 +377,7 @@ static int __init do_name(void) if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { - int openflags = O_WRONLY|O_CREAT; + int openflags = O_WRONLY|O_CREAT|O_LARGEFILE; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); @@ -385,7 +396,7 @@ static int __init do_name(void) init_mkdir(collected, mode); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); - dir_add(collected, mtime); + dir_add(collected, name_len, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { @@ -422,6 +433,12 @@ static int __init do_copy(void) static int __init do_symlink(void) { + if (collected[name_len - 1] != '\0') { + pr_err("initramfs symlink without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); @@ -480,20 +497,33 @@ static unsigned long my_inptr __initdata; /* index of next byte to be processed #include <linux/decompress/generic.h> -static char * __init unpack_to_rootfs(char *buf, unsigned long len) +/** + * unpack_to_rootfs - decompress and extract an initramfs archive + * @buf: input initramfs archive to extract + * @len: length of initramfs data to process + * + * Returns: NULL for success or an error message string + * + * This symbol shouldn't be used externally. It's available for unit tests. 
+ */ +char * __init unpack_to_rootfs(char *buf, unsigned long len) { long written; decompress_fn decompress; const char *compress_name; - static __initdata char msg_buf[64]; - - header_buf = kmalloc(110, GFP_KERNEL); - symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); - name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); + struct { + char header[CPIO_HDRLEN]; + char symlink[PATH_MAX + N_ALIGN(PATH_MAX) + 1]; + char name[N_ALIGN(PATH_MAX)]; + } *bufs = kmalloc(sizeof(*bufs), GFP_KERNEL); - if (!header_buf || !symlink_buf || !name_buf) + if (!bufs) panic_show_mem("can't allocate buffers"); + header_buf = bufs->header; + symlink_buf = bufs->symlink; + name_buf = bufs->name; + state = Start; this_header = 0; message = NULL; @@ -521,12 +551,9 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) if (res) error("decompressor failed"); } else if (compress_name) { - if (!message) { - snprintf(msg_buf, sizeof msg_buf, - "compression method %s not configured", - compress_name); - message = msg_buf; - } + pr_err("compression method %s not configured\n", + compress_name); + error("decompressor failed"); } else error("invalid magic at start of compressed archive"); if (state != Reset) @@ -536,9 +563,9 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) len -= my_inptr; } dir_utime(); - kfree(name_buf); - kfree(symlink_buf); - kfree(header_buf); + /* free any hardlink state collected without optional TRAILER!!! 
*/ + free_hash(); + kfree(bufs); return message; } @@ -574,15 +601,7 @@ extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> -static ssize_t raw_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t pos, size_t count) -{ - memcpy(buf, attr->private + pos, count); - return count; -} - -static BIN_ATTR(initrd, 0440, raw_read, NULL, 0); +static BIN_ATTR(initrd, 0440, sysfs_bin_attr_simple_read, NULL, 0); void __init reserve_initrd_mem(void) { @@ -642,7 +661,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) "initrd"); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); @@ -679,11 +698,9 @@ static void __init populate_initrd_image(char *err) struct file *file; loff_t pos = 0; - unpack_to_rootfs(__initramfs_start, __initramfs_size); - printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); - file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700); if (IS_ERR(file)) return; @@ -721,6 +738,8 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) } done: + security_initramfs_populated(); + /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. 
@@ -736,8 +755,7 @@ done: initrd_start = 0; initrd_end = 0; - flush_delayed_fput(); - task_work_run(); + init_flush_fput(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); diff --git a/init/initramfs_internal.h b/init/initramfs_internal.h new file mode 100644 index 000000000000..233dad16b0a0 --- /dev/null +++ b/init/initramfs_internal.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __INITRAMFS_INTERNAL_H__ +#define __INITRAMFS_INTERNAL_H__ + +char *unpack_to_rootfs(char *buf, unsigned long len); +#define CPIO_HDRLEN 110 + +#endif diff --git a/init/initramfs_test.c b/init/initramfs_test.c new file mode 100644 index 000000000000..517e5e04e5cc --- /dev/null +++ b/init/initramfs_test.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init_syscalls.h> +#include <linux/stringify.h> +#include <linux/timekeeping.h> +#include "initramfs_internal.h" + +struct initramfs_test_cpio { + char *magic; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int nlink; + unsigned int mtime; + unsigned int filesize; + unsigned int devmajor; + unsigned int devminor; + unsigned int rdevmajor; + unsigned int rdevminor; + unsigned int namesize; + unsigned int csum; + char *fname; + char *data; +}; + +static size_t fill_cpio(struct initramfs_test_cpio *cs, size_t csz, char *out) +{ + int i; + size_t off = 0; + + for (i = 0; i < csz; i++) { + char *pos = &out[off]; + struct initramfs_test_cpio *c = &cs[i]; + size_t thislen; + + /* +1 to account for nulterm */ + thislen = sprintf(pos, "%s" + "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x" + "%s", + c->magic, c->ino, c->mode, c->uid, c->gid, c->nlink, + c->mtime, c->filesize, c->devmajor, c->devminor, + c->rdevmajor, c->rdevminor, c->namesize, c->csum, + c->fname) + 1; + pr_debug("packing (%zu): %.*s\n", thislen, (int)thislen, pos); + off += thislen; + while 
(off & 3) + out[off++] = '\0'; + + memcpy(&out[off], c->data, c->filesize); + off += c->filesize; + while (off & 3) + out[off++] = '\0'; + } + + return off; +} + +static void __init initramfs_test_extract(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct timespec64 ts_before, ts_after; + struct kstat st = {}; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 12, + .gid = 34, + .nlink = 1, + .mtime = 56, + .filesize = 0, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_extract"), + .csum = 0, + .fname = "initramfs_test_extract", + }, { + .magic = "070701", + .ino = 2, + .mode = S_IFDIR | 0777, + .nlink = 1, + .mtime = 57, + .devminor = 1, + .namesize = sizeof("initramfs_test_extract_dir"), + .fname = "initramfs_test_extract_dir", + }, { + .magic = "070701", + .namesize = sizeof("TRAILER!!!"), + .fname = "TRAILER!!!", + } }; + + /* +3 to cater for any 4-byte end-alignment */ + cpio_srcbuf = kzalloc(ARRAY_SIZE(c) * (CPIO_HDRLEN + PATH_MAX + 3), + GFP_KERNEL); + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + ktime_get_real_ts64(&ts_before); + err = unpack_to_rootfs(cpio_srcbuf, len); + ktime_get_real_ts64(&ts_after); + if (err) { + KUNIT_FAIL(test, "unpack failed %s", err); + goto out; + } + + KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st, 0), 0); + KUNIT_EXPECT_TRUE(test, S_ISREG(st.mode)); + KUNIT_EXPECT_TRUE(test, uid_eq(st.uid, KUIDT_INIT(c[0].uid))); + KUNIT_EXPECT_TRUE(test, gid_eq(st.gid, KGIDT_INIT(c[0].gid))); + KUNIT_EXPECT_EQ(test, st.nlink, 1); + if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) { + KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[0].mtime); + } else { + KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec); + KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec); + } + KUNIT_EXPECT_EQ(test, st.blocks, c[0].filesize); + + KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st, 0), 0); + 
KUNIT_EXPECT_TRUE(test, S_ISDIR(st.mode)); + if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) { + KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[1].mtime); + } else { + KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec); + KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec); + } + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_rmdir(c[1].fname), 0); +out: + kfree(cpio_srcbuf); +} + +/* + * Don't terminate filename. Previously, the cpio filename field was passed + * directly to filp_open(collected, O_CREAT|..) without nulterm checks. See + * https://lore.kernel.org/linux-fsdevel/20241030035509.20194-2-ddiss@suse.de + */ +static void __init initramfs_test_fname_overrun(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len, suffix_off; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 0, + .gid = 0, + .nlink = 1, + .mtime = 1, + .filesize = 0, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_fname_overrun"), + .csum = 0, + .fname = "initramfs_test_fname_overrun", + } }; + + /* + * poison cpio source buffer, so we can detect overrun. source + * buffer is used by read_into() when hdr or fname + * are already available (e.g. no compression). 
+ */ + cpio_srcbuf = kmalloc(CPIO_HDRLEN + PATH_MAX + 3, GFP_KERNEL); + memset(cpio_srcbuf, 'B', CPIO_HDRLEN + PATH_MAX + 3); + /* limit overrun to avoid crashes / filp_open() ENAMETOOLONG */ + cpio_srcbuf[CPIO_HDRLEN + strlen(c[0].fname) + 20] = '\0'; + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + /* overwrite trailing fname terminator and padding */ + suffix_off = len - 1; + while (cpio_srcbuf[suffix_off] == '\0') { + cpio_srcbuf[suffix_off] = 'P'; + suffix_off--; + } + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NOT_NULL(test, err); + + kfree(cpio_srcbuf); +} + +static void __init initramfs_test_data(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct file *file; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 0, + .gid = 0, + .nlink = 1, + .mtime = 1, + .filesize = sizeof("ASDF") - 1, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_data"), + .csum = 0, + .fname = "initramfs_test_data", + .data = "ASDF", + } }; + + /* +6 for max name and data 4-byte padding */ + cpio_srcbuf = kmalloc(CPIO_HDRLEN + c[0].namesize + c[0].filesize + 6, + GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + file = filp_open(c[0].fname, O_RDONLY, 0); + if (IS_ERR(file)) { + KUNIT_FAIL(test, "open failed"); + goto out; + } + + /* read back file contents into @cpio_srcbuf and confirm match */ + len = kernel_read(file, cpio_srcbuf, c[0].filesize, NULL); + KUNIT_EXPECT_EQ(test, len, c[0].filesize); + KUNIT_EXPECT_MEMEQ(test, cpio_srcbuf, c[0].data, len); + + fput(file); + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); +out: + kfree(cpio_srcbuf); +} + +static void __init initramfs_test_csum(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct initramfs_test_cpio c[] = { { + /* 070702 magic indicates a valid csum is 
present */ + .magic = "070702", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 1, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_csum"), + .csum = 'A' + 'S' + 'D' + 'F', + .fname = "initramfs_test_csum", + .data = "ASDF", + }, { + /* mix csum entry above with no-csum entry below */ + .magic = "070701", + .ino = 2, + .mode = S_IFREG | 0777, + .nlink = 1, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_csum_not_here"), + /* csum ignored */ + .csum = 5555, + .fname = "initramfs_test_csum_not_here", + .data = "ASDF", + } }; + + cpio_srcbuf = kmalloc(8192, GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0); + + /* mess up the csum and confirm that unpack fails */ + c[0].csum--; + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NOT_NULL(test, err); + + /* + * file (with content) is still retained in case of bad-csum abort. + * Perhaps we should change this. 
+ */ + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), -ENOENT); + kfree(cpio_srcbuf); +} + +/* + * hardlink hashtable may leak when the archive omits a trailer: + * https://lore.kernel.org/r/20241107002044.16477-10-ddiss@suse.de/ + */ +static void __init initramfs_test_hardlink(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct kstat st0, st1; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 2, + .devminor = 1, + .namesize = sizeof("initramfs_test_hardlink"), + .fname = "initramfs_test_hardlink", + }, { + /* hardlink data is present in last archive entry */ + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 2, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_hardlink_link"), + .fname = "initramfs_test_hardlink_link", + .data = "ASDF", + } }; + + cpio_srcbuf = kmalloc(8192, GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st0, 0), 0); + KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st1, 0), 0); + KUNIT_EXPECT_EQ(test, st0.ino, st1.ino); + KUNIT_EXPECT_EQ(test, st0.nlink, 2); + KUNIT_EXPECT_EQ(test, st1.nlink, 2); + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0); + + kfree(cpio_srcbuf); +} + +#define INITRAMFS_TEST_MANY_LIMIT 1000 +#define INITRAMFS_TEST_MANY_PATH_MAX (sizeof("initramfs_test_many-") \ + + sizeof(__stringify(INITRAMFS_TEST_MANY_LIMIT))) +static void __init initramfs_test_many(struct kunit *test) +{ + char *err, *cpio_srcbuf, *p; + size_t len = INITRAMFS_TEST_MANY_LIMIT * + (CPIO_HDRLEN + INITRAMFS_TEST_MANY_PATH_MAX + 3); + char thispath[INITRAMFS_TEST_MANY_PATH_MAX]; + int i; + + p = cpio_srcbuf = kmalloc(len, GFP_KERNEL); + + for (i = 0; i < 
INITRAMFS_TEST_MANY_LIMIT; i++) { + struct initramfs_test_cpio c = { + .magic = "070701", + .ino = i, + .mode = S_IFREG | 0777, + .nlink = 1, + .devminor = 1, + .fname = thispath, + }; + + c.namesize = 1 + sprintf(thispath, "initramfs_test_many-%d", i); + p += fill_cpio(&c, 1, p); + } + + len = p - cpio_srcbuf; + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + for (i = 0; i < INITRAMFS_TEST_MANY_LIMIT; i++) { + sprintf(thispath, "initramfs_test_many-%d", i); + KUNIT_EXPECT_EQ(test, init_unlink(thispath), 0); + } + + kfree(cpio_srcbuf); +} + +/* + * The kunit_case/_suite struct cannot be marked as __initdata as this will be + * used in debugfs to retrieve results after test has run. + */ +static struct kunit_case __refdata initramfs_test_cases[] = { + KUNIT_CASE(initramfs_test_extract), + KUNIT_CASE(initramfs_test_fname_overrun), + KUNIT_CASE(initramfs_test_data), + KUNIT_CASE(initramfs_test_csum), + KUNIT_CASE(initramfs_test_hardlink), + KUNIT_CASE(initramfs_test_many), + {}, +}; + +static struct kunit_suite initramfs_test_suite = { + .name = "initramfs", + .test_cases = initramfs_test_cases, +}; +kunit_test_init_section_suites(&initramfs_test_suite); + +MODULE_DESCRIPTION("Initramfs KUnit test suite"); +MODULE_LICENSE("GPL v2"); diff --git a/init/main.c b/init/main.c index e24b0780fdff..ed576c7f475d 100644 --- a/init/main.c +++ b/init/main.c @@ -50,6 +50,7 @@ #include <linux/writeback.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/efi.h> #include <linux/tick.h> @@ -88,6 +89,7 @@ #include <linux/sched/task_stack.h> #include <linux/context_tracking.h> #include <linux/random.h> +#include <linux/moduleloader.h> #include <linux/list.h> #include <linux/integrity.h> #include <linux/proc_ns.h> @@ -99,6 +101,8 @@ #include <linux/init_syscalls.h> #include <linux/stackdepot.h> #include <linux/randomize_kstack.h> +#include <linux/pidfs.h> +#include <linux/ptdump.h> 
#include <net/net_namespace.h> #include <asm/io.h> @@ -324,7 +328,7 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size, { struct xbc_node *knode, *vnode; char *end = buf + size; - const char *val; + const char *val, *q; int ret; xbc_node_for_each_key_value(root, knode, val) { @@ -342,8 +346,14 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size, continue; } xbc_array_for_each_value(vnode, val) { - ret = snprintf(buf, rest(buf, end), "%s=\"%s\" ", - xbc_namebuf, val); + /* + * For prettier and more readable /proc/cmdline, only + * quote the value when necessary, i.e. when it contains + * whitespace. + */ + q = strpbrk(val, " \t\r\n") ? "\"" : ""; + ret = snprintf(buf, rest(buf, end), "%s=%s%s%s ", + xbc_namebuf, q, val, q); if (ret < 0) return ret; buf += ret; @@ -484,6 +494,11 @@ static int __init warn_bootconfig(char *str) early_param("bootconfig", warn_bootconfig); +bool __init cmdline_has_extra_options(void) +{ + return extra_command_line || extra_init_args; +} + /* Change NUL term back to "=", to make "param" the whole string. 
*/ static void __init repair_env_string(char *param, char *val) { @@ -603,7 +618,6 @@ static int __init rdinit_setup(char *str) __setup("rdinit=", rdinit_setup); #ifndef CONFIG_SMP -static const unsigned int setup_max_cpus = NR_CPUS; static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } #endif @@ -620,18 +634,18 @@ static void __init setup_command_line(char *command_line) if (extra_command_line) xlen = strlen(extra_command_line); - if (extra_init_args) + if (extra_init_args) { + extra_init_args = strim(extra_init_args); /* remove trailing space */ ilen = strlen(extra_init_args) + 4; /* for " -- " */ + } - len = xlen + strlen(boot_command_line) + 1; + len = xlen + strlen(boot_command_line) + ilen + 1; - saved_command_line = memblock_alloc(len + ilen, SMP_CACHE_BYTES); - if (!saved_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen); + saved_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); - static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!static_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + len = xlen + strlen(command_line) + 1; + + static_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); if (xlen) { /* @@ -681,7 +695,7 @@ static void __init setup_command_line(char *command_line) static __initdata DECLARE_COMPLETION(kthreadd_done); -noinline void __ref __noreturn rest_init(void) +static noinline void __ref __noreturn rest_init(void) { struct task_struct *tsk; int pid; @@ -737,10 +751,7 @@ static int __init do_early_param(char *param, char *val, const struct obs_kernel_param *p; for (p = __setup_start; p < __setup_end; p++) { - if ((p->early && parameq(param, p->str)) || - (strcmp(param, "console") == 0 && - strcmp(p->str, "earlycon") == 0) - ) { + if (p->early && parameq(param, p->str)) { if (p->setup_func(val) != 0) pr_warn("Malformed early option '%s'\n", param); } @@ -776,6 +787,10 @@ void __init __weak 
smp_setup_processor_id(void) { } +void __init __weak smp_prepare_boot_cpu(void) +{ +} + # if THREAD_SIZE >= PAGE_SIZE void __init __weak thread_stack_cache_init(void) { @@ -822,11 +837,6 @@ static int __init early_randomize_kstack_offset(char *buf) early_param("randomize_kstack_offset", early_randomize_kstack_offset); #endif -void __init __weak __noreturn arch_call_rest_init(void) -{ - rest_init(); -} - static void __init print_unknown_bootoptions(void) { char *unknown_options; @@ -870,6 +880,19 @@ static void __init print_unknown_bootoptions(void) memblock_free(unknown_options, len); } +static void __init early_numa_node_init(void) +{ +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +#ifndef cpu_to_node + int cpu; + + /* The early_cpu_to_node() should be ready here. */ + for_each_possible_cpu(cpu) + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); +#endif +#endif +} + asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector void start_kernel(void) { @@ -893,18 +916,21 @@ void start_kernel(void) boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); - early_security_init(); setup_arch(&command_line); + /* Static keys and static calls are needed by LSMs */ + jump_label_init(); + static_call_init(); + early_security_init(); setup_boot_config(); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + early_numa_node_init(); boot_cpu_hotplug_init(); pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ - jump_label_init(); parse_early_param(); after_dashes = parse_args("Booting kernel", static_command_line, __start___param, @@ -963,6 +989,7 @@ void start_kernel(void) workqueue_init_early(); rcu_init(); + kvfree_rcu_init(); /* Trace events are available after this */ trace_init(); @@ -976,7 +1003,7 @@ void start_kernel(void) init_IRQ(); tick_init(); rcu_init_nohz(); - init_timers(); + timers_init(); srcu_init(); 
hrtimers_init(); softirq_init(); @@ -1059,7 +1086,9 @@ void start_kernel(void) seq_file_init(); proc_root_init(); nsfs_init(); + pidfs_init(); cpuset_init(); + mem_cgroup_init(); cgroup_init(); taskstats_init_early(); delayacct_init(); @@ -1069,7 +1098,7 @@ void start_kernel(void) kcsan_init(); /* Do the rest non-__init'ed, we're now alive */ - arch_call_rest_init(); + rest_init(); /* * Avoid stack canaries in callers of boot_init_stack_canary for gcc-10 @@ -1115,16 +1144,10 @@ static int __init initcall_blacklist(char *str) str_entry = strsep(&str, ","); if (str_entry) { pr_debug("blacklisting initcall %s\n", str_entry); - entry = memblock_alloc(sizeof(*entry), + entry = memblock_alloc_or_panic(sizeof(*entry), SMP_CACHE_BYTES); - if (!entry) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*entry)); - entry->buf = memblock_alloc(strlen(str_entry) + 1, + entry->buf = memblock_alloc_or_panic(strlen(str_entry) + 1, SMP_CACHE_BYTES); - if (!entry->buf) - panic("%s: Failed to allocate %zu bytes\n", - __func__, strlen(str_entry) + 1); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } @@ -1193,6 +1216,12 @@ trace_initcall_finish_cb(void *data, initcall_t fn, int ret) fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime)); } +static __init_or_module void +trace_initcall_level_cb(void *data, const char *level) +{ + printk(KERN_DEBUG "entering initcall level: %s\n", level); +} + static ktime_t initcall_calltime; #ifdef TRACEPOINTS_ENABLED @@ -1204,10 +1233,12 @@ static void __init initcall_debug_enable(void) &initcall_calltime); ret |= register_trace_initcall_finish(trace_initcall_finish_cb, &initcall_calltime); + ret |= register_trace_initcall_level(trace_initcall_level_cb, NULL); WARN(ret, "Failed to register initcall tracepoints\n"); } # define do_trace_initcall_start trace_initcall_start # define do_trace_initcall_finish trace_initcall_finish +# define do_trace_initcall_level trace_initcall_level #else static 
inline void do_trace_initcall_start(initcall_t fn) { @@ -1221,6 +1252,12 @@ static inline void do_trace_initcall_finish(initcall_t fn, int ret) return; trace_initcall_finish_cb(&initcall_calltime, fn, ret); } +static inline void do_trace_initcall_level(const char *level) +{ + if (!initcall_debug) + return; + trace_initcall_level_cb(NULL, level); +} #endif /* !TRACEPOINTS_ENABLED */ int __init_or_module do_one_initcall(initcall_t fn) @@ -1293,7 +1330,7 @@ static void __init do_initcall_level(int level, char *command_line) level, level, NULL, ignore_unknown_bootoption); - trace_initcall_level(initcall_level_names[level]); + do_trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) do_one_initcall(initcall_from_entry(fn)); } @@ -1337,7 +1374,7 @@ static void __init do_pre_smp_initcalls(void) { initcall_entry_t *fn; - trace_initcall_level("early"); + do_trace_initcall_level("early"); for (fn = __initcall_start; fn < __initcall0_start; fn++) do_one_initcall(initcall_from_entry(fn)); } @@ -1396,33 +1433,28 @@ static int __init set_debug_rodata(char *str) early_param("rodata", set_debug_rodata); #endif -#ifdef CONFIG_STRICT_KERNEL_RWX static void mark_readonly(void) { - if (rodata_enabled) { + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && rodata_enabled) { /* * load_module() results in W+X mappings, which are cleaned - * up with call_rcu(). Let's make sure that queued work is + * up with init_free_wq. Let's make sure that queued work is * flushed so that we don't hit false positives looking for * insecure pages which are W+X. 
*/ - rcu_barrier(); + flush_module_init_free_work(); + jump_label_init_ro(); mark_rodata_ro(); + debug_checkwx(); rodata_test(); - } else + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { pr_info("Kernel memory protection disabled.\n"); + } else if (IS_ENABLED(CONFIG_ARCH_HAS_STRICT_KERNEL_RWX)) { + pr_warn("Kernel memory protection not selected by kernel config.\n"); + } else { + pr_warn("This architecture does not have kernel memory protection.\n"); + } } -#elif defined(CONFIG_ARCH_HAS_STRICT_KERNEL_RWX) -static inline void mark_readonly(void) -{ - pr_warn("Kernel memory protection not selected by kernel config.\n"); -} -#else -static inline void mark_readonly(void) -{ - pr_warn("This architecture does not have kernel memory protection.\n"); -} -#endif void __weak free_initmem(void) { @@ -1537,7 +1569,6 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); - rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); @@ -1545,6 +1576,7 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); workqueue_init_topology(); + async_init(); padata_init(); page_alloc_init_late(); |