Diffstat (limited to 'init')
-rw-r--r--   init/.gitignore                2
-rw-r--r--   init/.kunitconfig              3
-rw-r--r--   init/Kconfig                2356
-rw-r--r--   init/Makefile                 64
-rw-r--r--   init/calibrate.c              38
-rw-r--r--   init/do_mounts.c             676
-rw-r--r--   init/do_mounts.h              64
-rw-r--r--   init/do_mounts_initrd.c      135
-rw-r--r--   init/do_mounts_md.c          312
-rw-r--r--   init/do_mounts_rd.c          160
-rw-r--r--   init/init_task.c             255
-rw-r--r--   init/initramfs.c             618
-rw-r--r--   init/initramfs_internal.h      8
-rw-r--r--   init/initramfs_test.c        472
-rw-r--r--   init/main.c                 1356
-rw-r--r--   init/noinitramfs.c            24
-rw-r--r--   init/version-timestamp.c      26
-rw-r--r--   init/version.c                66
18 files changed, 4193 insertions, 2442 deletions
diff --git a/init/.gitignore b/init/.gitignore
new file mode 100644
index 000000000000..cbbe270cec00
--- /dev/null
+++ b/init/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/utsversion-tmp.h
diff --git a/init/.kunitconfig b/init/.kunitconfig
new file mode 100644
index 000000000000..acb906b1a5f9
--- /dev/null
+++ b/init/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_TEST=y
diff --git a/init/Kconfig b/init/Kconfig
index 247084be0590..fa79feb8fe57 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1,35 +1,205 @@
-config ARCH
+# SPDX-License-Identifier: GPL-2.0-only
+config CC_VERSION_TEXT
string
- option env="ARCH"
+ default "$(CC_VERSION_TEXT)"
+ help
+ This is used in unclear ways:
-config KERNELVERSION
- string
- option env="KERNELVERSION"
+ - Re-run Kconfig when the compiler is updated
+ The 'default' property references the environment variable,
+ CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd.
+ When the compiler is updated, Kconfig will be invoked.
-config DEFCONFIG_LIST
- string
- depends on !UML
- option defconfig_list
- default "/lib/modules/$UNAME_RELEASE/.config"
- default "/etc/kernel-config"
- default "/boot/config-$UNAME_RELEASE"
- default "$ARCH_DEFCONFIG"
- default "arch/$ARCH/defconfig"
+ - Ensure full rebuild when the compiler is updated
+ include/linux/compiler-version.h contains this option in the comment
+ line so fixdep adds include/config/CC_VERSION_TEXT into the
+ auto-generated dependency. When the compiler is updated, syncconfig
+ will touch it and then every file will be rebuilt.
+
+config CC_IS_GCC
+ def_bool $(success,test "$(cc-name)" = GCC)
+
+config GCC_VERSION
+ int
+ default $(cc-version) if CC_IS_GCC
+ default 0
+
+config CC_IS_CLANG
+ def_bool $(success,test "$(cc-name)" = Clang)
+
+config CLANG_VERSION
+ int
+ default $(cc-version) if CC_IS_CLANG
+ default 0
+
+config AS_IS_GNU
+ def_bool $(success,test "$(as-name)" = GNU)
+
+config AS_IS_LLVM
+ def_bool $(success,test "$(as-name)" = LLVM)
+
+config AS_VERSION
+ int
+ # Use clang version if this is the integrated assembler
+ default CLANG_VERSION if AS_IS_LLVM
+ default $(as-version)
+
+config LD_IS_BFD
+ def_bool $(success,test "$(ld-name)" = BFD)
+
+config LD_VERSION
+ int
+ default $(ld-version) if LD_IS_BFD
+ default 0
+
+config LD_IS_LLD
+ def_bool $(success,test "$(ld-name)" = LLD)
+
+config LLD_VERSION
+ int
+ default $(ld-version) if LD_IS_LLD
+ default 0
+
+config RUSTC_VERSION
+ int
+ default $(rustc-version)
+ help
+ It does not depend on `RUST` since that one may need to use the version
+ in a `depends on`.
+
+config RUST_IS_AVAILABLE
+ def_bool $(success,$(srctree)/scripts/rust_is_available.sh)
+ help
+ This shows whether a suitable Rust toolchain is available (found).
+
+ Please see Documentation/rust/quick-start.rst for instructions on how
+ to satisfy the build requirements of Rust support.
+
+ In particular, the Makefile target 'rustavailable' is useful to check
+ why the Rust toolchain is not being detected.
+
+config RUSTC_LLVM_VERSION
+ int
+ default $(rustc-llvm-version)
+
+config ARCH_HAS_CC_CAN_LINK
+ bool
+
+config CC_CAN_LINK
+ bool
+ default ARCH_CC_CAN_LINK if ARCH_HAS_CC_CAN_LINK
+ default $(cc_can_link_user,$(m64-flag)) if 64BIT
+ default $(cc_can_link_user,$(m32-flag))
+
+# Fixed in GCC 14, 13.3, 12.4 and 11.5
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921
+config GCC_ASM_GOTO_OUTPUT_BROKEN
+ bool
+ depends on CC_IS_GCC
+ default y if GCC_VERSION < 110500
+ default y if GCC_VERSION >= 120000 && GCC_VERSION < 120400
+ default y if GCC_VERSION >= 130000 && GCC_VERSION < 130300
+
+config CC_HAS_ASM_GOTO_OUTPUT
+ def_bool y
+ depends on !GCC_ASM_GOTO_OUTPUT_BROKEN
+ # Detect basic support
+ depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
+ # Detect clang (< v17) scoped label issues
+ depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto(""::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto(""::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null)
+
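For readability, the basic-support probe used by CC_HAS_ASM_GOTO_OUTPUT above is the following C fragment, reformatted here as a standalone translation unit (the test only checks that it compiles):

    int foo(int x)
    {
            /* asm goto with an output operand ("=r"(x)) and a label list */
            asm goto ("" : "=r"(x) : /* no inputs */ : /* no clobbers */ : bar);
            return x;
    bar:
            return 0;
    }
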
+config CC_HAS_ASM_GOTO_TIED_OUTPUT
+ depends on CC_HAS_ASM_GOTO_OUTPUT
+ # Detect buggy gcc and clang, fixed in gcc-11 clang-14.
+ def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null)
+
+config TOOLS_SUPPORT_RELR
+ def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh)
+
+config CC_HAS_ASM_INLINE
+ def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null)
+
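The asm-inline probe right above checks for "asm inline" (gcc 9+, also accepted by recent clang), which marks an asm statement as having minimal size in the inliner's cost model. A minimal sketch (the function name is made up):

    static inline void barrier_example(void)
    {
            /* treated as near-zero size by the inlining heuristics */
            asm inline ("" ::: "memory");
    }
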
+config CC_HAS_ASSUME
+ bool
+ # clang needs to be at least 19.1.0 since the meaning of the assume
+ # attribute changed:
+ # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17
+ default y if CC_IS_CLANG && CLANG_VERSION >= 190100
+ # supported since gcc 13.1.0
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654
+ default y if CC_IS_GCC && GCC_VERSION >= 130100
+
+config CC_HAS_NO_PROFILE_FN_ATTR
+ def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror)
+
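The attribute probed by CC_HAS_NO_PROFILE_FN_ATTR excludes individual functions from profiling instrumentation; a small sketch (the function name is invented for the example):

    /* No mcount/fentry call is emitted for this function even when the
     * rest of the file is built with profiling instrumentation. */
    __attribute__((no_profile_instrument_function))
    static void early_setup_helper(void)
    {
    }
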
+config CC_HAS_COUNTED_BY
+ bool
+ # clang needs to be at least 20.1.0 to avoid potential crashes
+ # when building structures that contain __counted_by
+ # https://github.com/ClangBuiltLinux/linux/issues/2114
+ # https://github.com/llvm/llvm-project/commit/160fb1121cdf703c3ef5e61fb26c5659eb581489
+ default y if CC_IS_CLANG && CLANG_VERSION >= 200100
+ # supported since gcc 15.1.0
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
+ default y if CC_IS_GCC && GCC_VERSION >= 150100
+
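As a rough illustration of what CC_HAS_COUNTED_BY gates (the struct and field names here are made up), the attribute ties a flexible array member to the struct field holding its element count, so the compiler and fortified accessors can bounds-check it:

    struct frame {
            int len;        /* number of valid bytes in payload[] */
            unsigned char payload[] __attribute__((counted_by(len)));
    };
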
+config CC_HAS_MULTIDIMENSIONAL_NONSTRING
+ def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror)
+
+config LD_CAN_USE_KEEP_IN_OVERLAY
+ # ld.lld prior to 21.0.0 did not support KEEP within an overlay description
+ # https://github.com/llvm/llvm-project/pull/130661
+ def_bool LD_IS_BFD || LLD_VERSION >= 210000
+
+config RUSTC_HAS_SLICE_AS_FLATTENED
+ def_bool RUSTC_VERSION >= 108000
+
+config RUSTC_HAS_COERCE_POINTEE
+ def_bool RUSTC_VERSION >= 108400
+
+config RUSTC_HAS_SPAN_FILE
+ def_bool RUSTC_VERSION >= 108800
+
+config RUSTC_HAS_UNNECESSARY_TRANSMUTES
+ def_bool RUSTC_VERSION >= 108800
+
+config RUSTC_HAS_FILE_WITH_NUL
+ def_bool RUSTC_VERSION >= 108900
+
+config RUSTC_HAS_FILE_AS_C_STR
+ def_bool RUSTC_VERSION >= 109100
+
+config PAHOLE_VERSION
+ int
+ default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
config CONSTRUCTORS
bool
- depends on !UML
config IRQ_WORK
+ def_bool y if SMP
+
+config BUILDTIME_TABLE_SORT
bool
-config BUILDTIME_EXTABLE_SORT
+config THREAD_INFO_IN_TASK
bool
+ help
+ Select this to move thread_info off the stack into task_struct. To
+ make this work, an arch will need to remove all thread_info fields
+ except flags and fix any runtime bugs.
+
+ One subtle change that will be needed is to use try_get_task_stack()
+ and put_task_stack() in save_thread_stack_tsk() and get_wchan().
menu "General setup"
config BROKEN
bool
+ help
+ This option allows you to choose whether you want to try to
+ compile (and fix) old drivers that haven't been updated to
+ new infrastructure.
config BROKEN_ON_SMP
bool
@@ -44,18 +214,9 @@ config INIT_ENV_ARG_LIMIT
Maximum of each of the number of arguments and environment
variables passed to init from the kernel command line.
-
-config CROSS_COMPILE
- string "Cross-compiler tool prefix"
- help
- Same as running 'make CROSS_COMPILE=prefix-' but stored for
- default make runs in this kernel build directory. You don't
- need to set this unless you want the configured kernel build
- directory to select the cross-compiler automatically.
-
config COMPILE_TEST
bool "Compile also drivers which will not load"
- default n
+ depends on HAS_IOMEM
help
Some drivers can be compiled on a different platform than they are
intended to be run on. Despite they cannot be loaded there (or even
@@ -67,6 +228,33 @@ config COMPILE_TEST
here. If you are a user/distributor, say N here to exclude useless
drivers to be distributed.
+config WERROR
+ bool "Compile the kernel with warnings as errors"
+ default COMPILE_TEST
+ help
+ A kernel build should not cause any compiler warnings, and this
+ enables the '-Werror' (for C) and '-Dwarnings' (for Rust) flags
+ to enforce that rule by default. Certain warnings from other tools
+ such as the linker may be upgraded to errors with this option as
+ well.
+
+ However, if you have a new (or very old) compiler or linker with odd
+ and unusual warnings, or you have some architecture with problems,
+ you may need to disable this config option in order to
+ successfully build the kernel.
+
+ If in doubt, say Y.
+
+config UAPI_HEADER_TEST
+ bool "Compile test UAPI headers"
+ depends on HEADERS_INSTALL && CC_CAN_LINK
+ help
+ Compile test headers exported to user-space to ensure they are
+ self-contained, i.e. compilable as standalone units.
+
+ If you are a developer or tester and want to ensure the exported
+ headers are self-contained, say Y here. Otherwise, choose N.
+
config LOCALVERSION
string "Local version - append to kernel release"
help
@@ -80,6 +268,7 @@ config LOCALVERSION
config LOCALVERSION_AUTO
bool "Automatically append version information to the version string"
default y
+ depends on !COMPILE_TEST
help
This will try to automatically determine if the current tree is a
release tree by looking for git tags that belong to the current
@@ -90,13 +279,22 @@ config LOCALVERSION_AUTO
appended after any matching localversion* files, and after the value
set in CONFIG_LOCALVERSION.
- (The actual string used here is the first eight characters produced
+ (The actual string used here is the first 12 characters produced
by running the command:
$ git rev-parse --verify HEAD
which is done within the script "scripts/setlocalversion".)
+config BUILD_SALT
+ string "Build ID Salt"
+ default ""
+ help
+ The build ID is used to link binaries and their debug info. Setting
+ this option will use the value in the calculation of the build id.
+ This is mostly useful for distributions which want to ensure the
+ build is unique between builds. It's safe to leave the default.
+
config HAVE_KERNEL_GZIP
bool
@@ -115,10 +313,16 @@ config HAVE_KERNEL_LZO
config HAVE_KERNEL_LZ4
bool
+config HAVE_KERNEL_ZSTD
+ bool
+
+config HAVE_KERNEL_UNCOMPRESSED
+ bool
+
choice
prompt "Kernel compression mode"
default KERNEL_GZIP
- depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4
+ depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_ZSTD || HAVE_KERNEL_UNCOMPRESSED
help
The linux kernel is a kind of self-extracting executable.
Several compression algorithms are available, which differ
@@ -170,8 +374,9 @@ config KERNEL_XZ
BCJ filters which can improve compression ratio of executable
code. The size of the kernel is about 30% smaller with XZ in
comparison to gzip. On architectures for which there is a BCJ
- filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ
- will create a few percent smaller kernel than plain LZMA.
+ filter (i386, x86_64, ARM, ARM64, RISC-V, big endian PowerPC,
+ and SPARC), XZ will create a few percent smaller kernel than
+ plain LZMA.
The speed is about the same as with LZMA: The decompression
speed of XZ is better than that of bzip2 but worse than gzip
@@ -197,8 +402,38 @@ config KERNEL_LZ4
is about 8% bigger than LZO. But the decompression speed is
faster than LZO.
+config KERNEL_ZSTD
+ bool "ZSTD"
+ depends on HAVE_KERNEL_ZSTD
+ help
+ ZSTD is a compression algorithm targeting intermediate compression
+ with fast decompression speed. It will compress better than GZIP and
+ decompress around the same speed as LZO, but slower than LZ4. You
+ will need at least 192 KB RAM or more for booting. The zstd command
+ line tool is required for compression.
+
+config KERNEL_UNCOMPRESSED
+ bool "None"
+ depends on HAVE_KERNEL_UNCOMPRESSED
+ help
+ Produce uncompressed kernel image. This option is usually not what
+ you want. It is useful for debugging the kernel in slow simulation
+ environments, where decompressing and moving the kernel is awfully
+ slow. This option allows early boot code to skip the decompressor
+ and jump right at uncompressed kernel image.
+
endchoice
+config DEFAULT_INIT
+ string "Default init path"
+ default ""
+ help
+ This option determines the default init for the system if no init=
+ option is passed on the kernel command line. If the requested path is
+ not present, we will still then move on to attempting further
+ locations (e.g. /sbin/init, etc). If this is empty, we will just use
+ the fallback list when init= is not passed.
+
config DEFAULT_HOSTNAME
string "Default hostname"
default "(none)"
@@ -208,19 +443,9 @@ config DEFAULT_HOSTNAME
but you may wish to use a different default here to make a minimal
system more usable with less configuration.
-config SWAP
- bool "Support for paging of anonymous memory (swap)"
- depends on MMU && BLOCK
- default y
- help
- This option allows you to choose whether you want to have support
- for so called swap devices or swap files in your kernel that are
- used to provide more virtual memory than the actual RAM present
- in your computer. If unsure say Y.
-
config SYSVIPC
bool "System V IPC"
- ---help---
+ help
Inter Process Communication is a suite of library functions and
system calls which let processes (running programs) synchronize and
exchange information. It is generally considered to be a good thing,
@@ -239,10 +464,14 @@ config SYSVIPC_SYSCTL
depends on SYSCTL
default y
+config SYSVIPC_COMPAT
+ def_bool y
+ depends on COMPAT && SYSVIPC
+
config POSIX_MQUEUE
bool "POSIX Message Queues"
depends on NET
- ---help---
+ help
POSIX variant of message queues is a part of IPC. In POSIX message
queues every message has a priority which decides about succession
of receiving it by a process. If you want to compile and run
@@ -261,17 +490,27 @@ config POSIX_MQUEUE_SYSCTL
depends on SYSCTL
default y
-config FHANDLE
- bool "open by fhandle syscalls"
- select EXPORTFS
+config WATCH_QUEUE
+ bool "General notification queue"
+ default n
help
- If you say Y here, a user level program will be able to map
- file names to handle and then later use the handle for
- different file system operations. This is useful in implementing
- userspace file servers, which now track files using handles instead
- of names. The handle would remain the same even if file names
- get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
- syscalls.
+
+ This is a general notification queue for the kernel to pass events to
+ userspace by splicing them into pipes. It can be used in conjunction
+ with watches for key/keyring change notifications and device
+ notifications.
+
+ See Documentation/core-api/watch_queue.rst
+
+config CROSS_MEMORY_ATTACH
+ bool "Enable process_vm_readv/writev syscalls"
+ depends on MMU
+ default y
+ help
+ Enabling this option adds the system calls process_vm_readv and
+ process_vm_writev which allow a process with the correct privileges
+ to directly read from or write to another process' address space.
+ See the man page for more details.
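A minimal user-space sketch of what CROSS_MEMORY_ATTACH provides (the buffer size and names are arbitrary; the caller needs ptrace-style permission on the target, e.g. same uid or CAP_SYS_PTRACE):

    #define _GNU_SOURCE
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Copy 64 bytes out of another process's address space. */
    static ssize_t peek_remote(pid_t pid, void *remote_addr, unsigned char *buf)
    {
            struct iovec local  = { .iov_base = buf,         .iov_len = 64 };
            struct iovec remote = { .iov_base = remote_addr, .iov_len = 64 };

            return process_vm_readv(pid, &local, 1, &remote, 1, 0);
    }
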
config AUDIT
bool "Auditing support"
@@ -279,44 +518,21 @@ config AUDIT
help
Enable auditing infrastructure that can be used with another
kernel subsystem, such as SELinux (which requires this for
- logging of avc messages output). Does not do system-call
- auditing without CONFIG_AUDITSYSCALL.
-
-config AUDITSYSCALL
- bool "Enable system-call auditing support"
- depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
- default y if SECURITY_SELINUX
- help
- Enable low-overhead system-call auditing infrastructure that
- can be used independently or with another kernel subsystem,
- such as SELinux.
+ logging of avc messages output). System call auditing is included
+ on architectures which support it.
-config AUDIT_WATCH
- def_bool y
- depends on AUDITSYSCALL
- select FSNOTIFY
+config HAVE_ARCH_AUDITSYSCALL
+ bool
-config AUDIT_TREE
+config AUDITSYSCALL
def_bool y
- depends on AUDITSYSCALL
+ depends on AUDIT && HAVE_ARCH_AUDITSYSCALL
select FSNOTIFY
-config AUDIT_LOGINUID_IMMUTABLE
- bool "Make audit loginuid immutable"
- depends on AUDIT
- help
- The config option toggles if a task setting its loginuid requires
- CAP_SYS_AUDITCONTROL or if that task should require no special permissions
- but should instead only allow setting its loginuid if it was never
- previously set. On systems which use systemd or a similar central
- process to restart login services this should be set to true. On older
- systems in which an admin would typically have to directly stop and
- start processes this should be set to false. Setting this to true allows
- one to drop potentially dangerous capabilites from the login tasks,
- but may not be backwards compatible with older init systems.
-
source "kernel/irq/Kconfig"
source "kernel/time/Kconfig"
+source "kernel/bpf/Kconfig"
+source "kernel/Kconfig.preempt"
menu "CPU/Task time and stats accounting"
@@ -325,8 +541,7 @@ config VIRT_CPU_ACCOUNTING
choice
prompt "Cputime accounting"
- default TICK_CPU_ACCOUNTING if !PPC64
- default VIRT_CPU_ACCOUNTING_NATIVE if PPC64
+ default TICK_CPU_ACCOUNTING
# Kind of a stub config for the pure tick based cputime accounting
config TICK_CPU_ACCOUNTING
@@ -354,9 +569,11 @@ config VIRT_CPU_ACCOUNTING_NATIVE
config VIRT_CPU_ACCOUNTING_GEN
bool "Full dynticks CPU time accounting"
- depends on HAVE_CONTEXT_TRACKING && 64BIT
+ depends on HAVE_CONTEXT_TRACKING_USER
+ depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+ depends on GENERIC_CLOCKEVENTS
select VIRT_CPU_ACCOUNTING
- select CONTEXT_TRACKING
+ select CONTEXT_TRACKING_USER
help
Select this option to enable task and CPU time accounting on full
dynticks systems. This accounting is implemented by watching every
@@ -369,9 +586,11 @@ config VIRT_CPU_ACCOUNTING_GEN
If unsure, say N.
+endchoice
+
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
- depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
+ depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
help
Select this option to enable fine granularity task irq time
accounting. This is done by reading a timestamp on each
@@ -380,10 +599,33 @@ config IRQ_TIME_ACCOUNTING
If in doubt, say N here.
-endchoice
+config HAVE_SCHED_AVG_IRQ
+ def_bool y
+ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
+ depends on SMP
+
+config SCHED_HW_PRESSURE
+ bool
+ default y if ARM && ARM_CPU_TOPOLOGY
+ default y if ARM64
+ depends on SMP
+ depends on CPU_FREQ_THERMAL
+ help
+ Select this option to enable HW pressure accounting in the
+ scheduler. HW pressure is the value conveyed to the scheduler
+ that reflects the reduction in CPU compute capacity resulted from
+ HW throttling. HW throttling occurs when the performance of
+ a CPU is capped due to high operating temperatures as an example.
+
+ If selected, the scheduler will be able to balance tasks accordingly,
+ i.e. put less load on throttled CPUs than on non/less throttled ones.
+
+ This requires the architecture to implement
+ arch_update_hw_pressure() and arch_scale_thermal_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
+ depends on MULTIUSER
help
If you say Y here, a user level program will be able to instruct the
kernel (via a special system call) to write process accounting
@@ -402,7 +644,7 @@ config BSD_PROCESS_ACCT_V3
help
If you say Y here, the process accounting information is written
in a new file format that also logs the process IDs of each
- process and it's parent. Note that this file format is incompatible
+ process and its parent. Note that this file format is incompatible
with previous v0/v1/v2 file formats, so you will need updated tools
for processing it. A preliminary version of these tools is available
at <http://www.gnu.org/software/acct/>.
@@ -410,6 +652,7 @@ config BSD_PROCESS_ACCT_V3
config TASKSTATS
bool "Export task/process statistics through netlink"
depends on NET
+ depends on MULTIUSER
default n
help
Export selected statistics for tasks/processes through the
@@ -423,6 +666,7 @@ config TASKSTATS
config TASK_DELAY_ACCT
bool "Enable per-task delay accounting"
depends on TASKSTATS
+ select SCHED_INFO
help
Collect information on time spent by a task waiting for system
resources like cpu, synchronous block I/O completion and swapping
@@ -449,316 +693,65 @@ config TASK_IO_ACCOUNTING
Say N if unsure.
-endmenu # "CPU/Task time and stats accounting"
-
-menu "RCU Subsystem"
-
-choice
- prompt "RCU Implementation"
- default TREE_RCU
-
-config TREE_RCU
- bool "Tree-based hierarchical RCU"
- depends on !PREEMPT && SMP
- select IRQ_WORK
+config PSI
+ bool "Pressure stall information tracking"
+ select KERNFS
help
- This option selects the RCU implementation that is
- designed for very large SMP system with hundreds or
- thousands of CPUs. It also scales down nicely to
- smaller systems.
+ Collect metrics that indicate how overcommitted the CPU, memory,
+ and IO capacity are in the system.
-config TREE_PREEMPT_RCU
- bool "Preemptible tree-based hierarchical RCU"
- depends on PREEMPT
- help
- This option selects the RCU implementation that is
- designed for very large SMP systems with hundreds or
- thousands of CPUs, but for which real-time response
- is also required. It also scales down nicely to
- smaller systems.
-
- Select this option if you are unsure.
+ If you say Y here, the kernel will create /proc/pressure/ with the
+ pressure statistics files cpu, memory, and io. These will indicate
+ the share of walltime in which some or all tasks in the system are
+ delayed due to contention of the respective resource.
-config TINY_RCU
- bool "UP-only small-memory-footprint RCU"
- depends on !PREEMPT && !SMP
- help
- This option selects the RCU implementation that is
- designed for UP systems from which real-time response
- is not required. This option greatly reduces the
- memory footprint of RCU.
-
-endchoice
-
-config PREEMPT_RCU
- def_bool TREE_PREEMPT_RCU
- help
- This option enables preemptible-RCU code that is common between
- the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
-
-config RCU_STALL_COMMON
- def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
- help
- This option enables RCU CPU stall code that is common between
- the TINY and TREE variants of RCU. The purpose is to allow
- the tiny variants to disable RCU CPU stall warnings, while
- making these warnings mandatory for the tree variants.
-
-config CONTEXT_TRACKING
- bool
-
-config RCU_USER_QS
- bool "Consider userspace as in RCU extended quiescent state"
- depends on HAVE_CONTEXT_TRACKING && SMP
- select CONTEXT_TRACKING
- help
- This option sets hooks on kernel / userspace boundaries and
- puts RCU in extended quiescent state when the CPU runs in
- userspace. It means that when a CPU runs in userspace, it is
- excluded from the global RCU state machine and thus doesn't
- try to keep the timer tick on for RCU.
-
- Unless you want to hack and help the development of the full
- dynticks mode, you shouldn't enable this option. It also
- adds unnecessary overhead.
-
- If unsure say N
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default CONTEXT_TRACKING
- help
- Probe on user/kernel boundaries by default in order to
- test the features that rely on it such as userspace RCU extended
- quiescent states.
- This test is there for debugging until we have a real user like the
- full dynticks mode.
-
-config RCU_FANOUT
- int "Tree-based hierarchical RCU fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
- depends on TREE_RCU || TREE_PREEMPT_RCU
- default 64 if 64BIT
- default 32 if !64BIT
- help
- This option controls the fanout of hierarchical implementations
- of RCU, allowing RCU to work efficiently on machines with
- large numbers of CPUs. This value must be at least the fourth
- root of NR_CPUS, which allows NR_CPUS to be insanely large.
- The default value of RCU_FANOUT should be used for production
- systems, but if you are stress-testing the RCU implementation
- itself, small RCU_FANOUT values allow you to test large-system
- code paths on small(er) systems.
-
- Select a specific number if testing RCU itself.
- Take the default if unsure.
-
-config RCU_FANOUT_LEAF
- int "Tree-based hierarchical RCU leaf-level fanout value"
- range 2 RCU_FANOUT if 64BIT
- range 2 RCU_FANOUT if !64BIT
- depends on TREE_RCU || TREE_PREEMPT_RCU
- default 16
- help
- This option controls the leaf-level fanout of hierarchical
- implementations of RCU, and allows trading off cache misses
- against lock contention. Systems that synchronize their
- scheduling-clock interrupts for energy-efficiency reasons will
- want the default because the smaller leaf-level fanout keeps
- lock contention levels acceptably low. Very large systems
- (hundreds or thousands of CPUs) will instead want to set this
- value to the maximum value possible in order to reduce the
- number of cache misses incurred during RCU's grace-period
- initialization. These systems tend to run CPU-bound, and thus
- are not helped by synchronized interrupts, and thus tend to
- skew them, which reduces lock contention enough that large
- leaf-level fanouts work well.
-
- Select a specific number if testing RCU itself.
-
- Select the maximum permissible value for large systems.
-
- Take the default if unsure.
-
-config RCU_FANOUT_EXACT
- bool "Disable tree-based hierarchical RCU auto-balancing"
- depends on TREE_RCU || TREE_PREEMPT_RCU
- default n
- help
- This option forces use of the exact RCU_FANOUT value specified,
- regardless of imbalances in the hierarchy. This is useful for
- testing RCU itself, and might one day be useful on systems with
- strong NUMA behavior.
+ In kernels with cgroup support, cgroups (cgroup2 only) will
+ have cpu.pressure, memory.pressure, and io.pressure files,
+ which aggregate pressure stalls for the grouped tasks only.
- Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+ For more details see Documentation/accounting/psi.rst.
Say N if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP
+config PSI_DEFAULT_DISABLED
+ bool "Require boot parameter to enable pressure stall information tracking"
default n
+ depends on PSI
help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
+ If set, pressure stall information tracking will be disabled
+ per default but can be enabled through passing psi=1 on the
+ kernel commandline during boot.
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
+ This feature adds some code to the task wakeup and sleep
+ paths of the scheduler. The overhead is too low to affect
+ common scheduling-intense workloads in practice (such as
+ webservers, memcache), but it does show up in artificial
+ scheduler stress tests, such as hackbench.
- Say N if you are unsure.
+ If you are paranoid and not sure what the kernel will be
+ used for, say Y.
-config TREE_RCU_TRACE
- def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
- select DEBUG_FS
- help
- This option provides tracing for the TREE_RCU and
- TREE_PREEMPT_RCU implementations, permitting Makefile to
- trivially select kernel/rcutree_trace.c.
-
-config RCU_BOOST
- bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU
- default n
- help
- This option boosts the priority of preempted RCU readers that
- block the current preemptible RCU grace period for too long.
- This option also prevents heavy loads from blocking RCU
- callback invocation for all flavors of RCU.
-
- Say Y here if you are working with real-time apps or heavy loads
- Say N here if you are unsure.
-
-config RCU_BOOST_PRIO
- int "Real-time priority to boost RCU readers to"
- range 1 99
- depends on RCU_BOOST
- default 1
- help
- This option specifies the real-time priority to which long-term
- preempted RCU readers are to be boosted. If you are working
- with a real-time application that has one or more CPU-bound
- threads running at a real-time priority level, you should set
- RCU_BOOST_PRIO to a priority higher then the highest-priority
- real-time CPU-bound thread. The default RCU_BOOST_PRIO value
- of 1 is appropriate in the common case, which is real-time
- applications that do not have any CPU-bound threads.
-
- Some real-time applications might not have a single real-time
- thread that saturates a given CPU, but instead might have
- multiple real-time threads that, taken together, fully utilize
- that CPU. In this case, you should set RCU_BOOST_PRIO to
- a priority higher than the lowest-priority thread that is
- conspiring to prevent the CPU from running any non-real-time
- tasks. For example, if one thread at priority 10 and another
- thread at priority 5 are between themselves fully consuming
- the CPU time on a given CPU, then RCU_BOOST_PRIO should be
- set to priority 6 or higher.
-
- Specify the real-time priority, or take the default if unsure.
-
-config RCU_BOOST_DELAY
- int "Milliseconds to delay boosting after RCU grace-period start"
- range 0 3000
- depends on RCU_BOOST
- default 500
- help
- This option specifies the time to wait after the beginning of
- a given grace period before priority-boosting preempted RCU
- readers blocking that grace period. Note that any RCU reader
- blocking an expedited RCU grace period is boosted immediately.
-
- Accept the default if unsure.
-
-config RCU_NOCB_CPU
- bool "Offload RCU callback processing from boot-selected CPUs"
- depends on TREE_RCU || TREE_PREEMPT_RCU
- default n
- help
- Use this option to reduce OS jitter for aggressive HPC or
- real-time workloads. It can also be used to offload RCU
- callback invocation to energy-efficient CPUs in battery-powered
- asymmetric multiprocessors.
-
- This option offloads callback invocation from the set of
- CPUs specified at boot time by the rcu_nocbs parameter.
- For each such CPU, a kthread ("rcuox/N") will be created to
- invoke callbacks, where the "N" is the CPU being offloaded,
- and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
- "s" for RCU-sched. Nothing prevents this kthread from running
- on the specified CPUs, but (1) the kthreads may be preempted
- between each callback, and (2) affinity or cgroups can be used
- to force the kthreads to run on whatever set of CPUs is desired.
+ Say N if unsure.
- Say Y here if you want to help to debug reduced OS jitter.
- Say N here if you are unsure.
+endmenu # "CPU/Task time and stats accounting"
-choice
- prompt "Build-forced no-CBs CPUs"
- default RCU_NOCB_CPU_NONE
- help
- This option allows no-CBs CPUs (whose RCU callbacks are invoked
- from kthreads rather than from softirq context) to be specified
- at build time. Additional no-CBs CPUs may be specified by
- the rcu_nocbs= boot parameter.
-
-config RCU_NOCB_CPU_NONE
- bool "No build_forced no-CBs CPUs"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
- help
- This option does not force any of the CPUs to be no-CBs CPUs.
- Only CPUs designated by the rcu_nocbs= boot parameter will be
- no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU
- kthreads whose names begin with "rcuo". All other CPUs will
- invoke their own RCU callbacks in softirq context.
-
- Select this option if you want to choose no-CBs CPUs at
- boot time, for example, to allow testing of different no-CBs
- configurations without having to rebuild the kernel each time.
-
-config RCU_NOCB_CPU_ZERO
- bool "CPU 0 is a build_forced no-CBs CPU"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
- help
- This option forces CPU 0 to be a no-CBs CPU, so that its RCU
- callbacks are invoked by a per-CPU kthread whose name begins
- with "rcuo". Additional CPUs may be designated as no-CBs
- CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs.
- All other CPUs will invoke their own RCU callbacks in softirq
- context.
-
- Select this if CPU 0 needs to be a no-CBs CPU for real-time
- or energy-efficiency reasons, but the real reason it exists
- is to ensure that randconfig testing covers mixed systems.
-
-config RCU_NOCB_CPU_ALL
- bool "All CPUs are build_forced no-CBs CPUs"
- depends on RCU_NOCB_CPU
- help
- This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
- boot parameter will be ignored. All CPUs' RCU callbacks will
- be executed in the context of per-CPU rcuo kthreads created for
- this purpose. Assuming that the kthreads whose names start with
- "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter
- on the remaining CPUs, but might decrease memory locality during
- RCU-callback invocation, thus potentially degrading throughput.
-
- Select this if all CPUs need to be no-CBs CPUs for real-time
- or energy-efficiency reasons.
+config CPU_ISOLATION
+ bool "CPU isolation"
+ depends on SMP
+ default y
+ help
+ Make sure that CPUs running critical tasks are not disturbed by
+ any source of "noise" such as unbound workqueues, timers, kthreads...
+ Unbound jobs get offloaded to housekeeping CPUs. This is driven by
+ the "isolcpus=" boot parameter.
-endchoice
+ Say Y if unsure.
-endmenu # "RCU Subsystem"
+source "kernel/rcu/Kconfig"
config IKCONFIG
tristate "Kernel .config support"
- ---help---
+ help
This option enables the complete Linux kernel ".config" file
contents to be saved in the kernel. It provides documentation
of which kernel options are used in a running kernel or in an
@@ -771,24 +764,88 @@ config IKCONFIG
config IKCONFIG_PROC
bool "Enable access to .config through /proc/config.gz"
depends on IKCONFIG && PROC_FS
- ---help---
+ help
This option enables access to the kernel configuration file
through /proc/config.gz.
+config IKHEADERS
+ tristate "Enable kernel headers through /sys/kernel/kheaders.tar.xz"
+ depends on SYSFS
+ help
+ This option enables access to the in-kernel headers that are generated during
+ the build process. These can be used to build eBPF tracing programs,
+ or similar programs. If you build the headers as a module, a module called
+ kheaders.ko is built which can be loaded on-demand to get access to headers.
+
config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- range 12 21
+ range 12 25
default 17
+ depends on PRINTK
help
- Select kernel log buffer size as a power of 2.
+ Select the minimal kernel log buffer size as a power of 2.
+ The final size is affected by LOG_CPU_MAX_BUF_SHIFT config
+ parameter, see below. Any higher size also might be forced
+ by "log_buf_len" boot parameter.
+
Examples:
- 17 => 128 KB
+ 17 => 128 KB
16 => 64 KB
- 15 => 32 KB
- 14 => 16 KB
+ 15 => 32 KB
+ 14 => 16 KB
13 => 8 KB
12 => 4 KB
+config LOG_CPU_MAX_BUF_SHIFT
+ int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)"
+ depends on SMP
+ range 0 21
+ default 0 if BASE_SMALL
+ default 12
+ depends on PRINTK
+ help
+ This option allows to increase the default ring buffer size
+ according to the number of CPUs. The value defines the contribution
+ of each CPU as a power of 2. The used space is typically only few
+ lines however it might be much more when problems are reported,
+ e.g. backtraces.
+
+ The increased size means that a new buffer has to be allocated and
+ the original static one is unused. It makes sense only on systems
+ with more CPUs. Therefore this value is used only when the sum of
+ contributions is greater than the half of the default kernel ring
+ buffer as defined by LOG_BUF_SHIFT. The default values are set
+ so that more than 16 CPUs are needed to trigger the allocation.
+
+ Also this option is ignored when "log_buf_len" kernel parameter is
+ used as it forces an exact (power of two) size of the ring buffer.
+
+ The number of possible CPUs is used for this computation ignoring
+ hotplugging making the computation optimal for the worst case
+ scenario while allowing a simple algorithm to be used from bootup.
+
+ Examples shift values and their meaning:
+ 17 => 128 KB for each CPU
+ 16 => 64 KB for each CPU
+ 15 => 32 KB for each CPU
+ 14 => 16 KB for each CPU
+ 13 => 8 KB for each CPU
+ 12 => 4 KB for each CPU
+
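A sketch of the sizing rule described in the LOG_CPU_MAX_BUF_SHIFT help text above (this follows the wording of the help text, not the kernel's exact allocation code):

    /* With the defaults (base shift 17 = 128 KB, per-CPU shift 12 = 4 KB),
     * 16 CPUs contribute 64 KB, which is not greater than half the base,
     * so the static buffer is kept; more CPUs trigger the larger allocation. */
    static unsigned long log_buf_bytes(int base_shift, int cpu_shift, int nr_cpus)
    {
            unsigned long base  = 1UL << base_shift;
            unsigned long extra = (unsigned long)nr_cpus << cpu_shift;

            return (extra > base / 2) ? base + extra : base;
    }
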
+config PRINTK_INDEX
+ bool "Printk indexing debugfs interface"
+ depends on PRINTK && DEBUG_FS
+ help
+ Add support for indexing of all printk formats known at compile time
+ at <debugfs>/printk/index/<module>.
+
+ This can be used as part of maintaining daemons which monitor
+ /dev/kmsg, as it permits auditing the printk formats present in a
+ kernel, allowing detection of cases where monitored printks are
+ changed or no longer present.
+
+ There is no additional runtime cost to printk with this enabled.
+
#
# Architectures with an unreliable sched_clock() should select this:
#
@@ -798,6 +855,71 @@ config HAVE_UNSTABLE_SCHED_CLOCK
config GENERIC_SCHED_CLOCK
bool
+menu "Scheduler features"
+
+config UCLAMP_TASK
+ bool "Enable utilization clamping for RT/FAIR tasks"
+ depends on CPU_FREQ_GOV_SCHEDUTIL
+ help
+ This feature enables the scheduler to track the clamped utilization
+ of each CPU based on RUNNABLE tasks scheduled on that CPU.
+
+ With this option, the user can specify the min and max CPU
+ utilization allowed for RUNNABLE tasks. The max utilization defines
+ the maximum frequency a task should use while the min utilization
+ defines the minimum frequency it should use.
+
+ Both min and max utilization clamp values are hints to the scheduler,
+ aiming at improving its frequency selection policy, but they do not
+ enforce or grant any specific bandwidth for tasks.
+
+ If in doubt, say N.
+
+config UCLAMP_BUCKETS_COUNT
+ int "Number of supported utilization clamp buckets"
+ range 5 20
+ default 5
+ depends on UCLAMP_TASK
+ help
+ Defines the number of clamp buckets to use. The range of each bucket
+ will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
+ number of clamp buckets the finer their granularity and the higher
+ the precision of clamping aggregation and tracking at run-time.
+
+ For example, with the minimum configuration value we will have 5
+ clamp buckets tracking 20% utilization each. A 25% boosted tasks will
+ be refcounted in the [20..39]% bucket and will set the bucket clamp
+ effective value to 25%.
+ If a second 30% boosted task should be co-scheduled on the same CPU,
+ that task will be refcounted in the same bucket of the first task and
+ it will boost the bucket clamp effective value to 30%.
+ The clamp effective value of a bucket is reset to its nominal value
+ (20% in the example above) when there are no more tasks refcounted in
+ that bucket.
+
+ An additional boost/capping margin can be added to some tasks. In the
+ example above the 25% task will be boosted to 30% until it exits the
+ CPU. If that should be considered not acceptable on certain systems,
+ it's always possible to reduce the margin by increasing the number of
+ clamp buckets to trade off used memory for run-time tracking
+ precision.
+
+ If in doubt, use the default value.
+
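A small sketch of the bucket mapping that the UCLAMP_BUCKETS_COUNT help text describes (SCHED_CAPACITY_SCALE is 1024 in the kernel; the function here is illustrative, not the kernel's implementation):

    #define SCHED_CAPACITY_SCALE    1024
    #define UCLAMP_BUCKETS          5       /* default from the option above */

    /* Each bucket spans 1024/5 = 204 capacity units (~20%).  A task clamped
     * to 25% of capacity (256) maps to bucket 1, i.e. the [20..39]% bucket
     * from the example in the help text. */
    static int uclamp_bucket_id(unsigned int clamp_value)
    {
            int id = clamp_value / (SCHED_CAPACITY_SCALE / UCLAMP_BUCKETS);

            return id < UCLAMP_BUCKETS ? id : UCLAMP_BUCKETS - 1;
    }
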
+config SCHED_PROXY_EXEC
+ bool "Proxy Execution"
+ # Avoid some build failures w/ PREEMPT_RT until it can be fixed
+ depends on !PREEMPT_RT
+ # Need to investigate how to inform sched_ext of split contexts
+ depends on !SCHED_CLASS_EXT
+ # Not particularly useful until we get to multi-rq proxying
+ depends on EXPERT
+ help
+ This option enables proxy execution, a mechanism for mutex-owning
+ tasks to inherit the scheduling context of higher priority waiters.
+
+endmenu
+
#
# For architectures that want to enable the support for NUMA-affine scheduler
# balancing logic:
@@ -805,200 +927,178 @@ config GENERIC_SCHED_CLOCK
config ARCH_SUPPORTS_NUMA_BALANCING
bool
-# For architectures that (ab)use NUMA to represent different memory regions
-# all cpu-local but of different latencies, such as SuperH.
#
-config ARCH_WANT_NUMA_VARIABLE_LOCALITY
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
bool
-#
-# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
-config ARCH_WANTS_PROT_NUMA_PROT_NONE
+config CC_HAS_INT128
+ def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT
+
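CC_HAS_INT128 above effectively detects whether the compiler predefines __SIZEOF_INT128__ on a 64-bit target; when it does, code can use the __int128 extension directly, for example for a full 64x64-bit multiply (an illustrative helper, not kernel code):

    static inline unsigned long long mul_u64_high(unsigned long long a,
                                                  unsigned long long b)
    {
            unsigned __int128 product = (unsigned __int128)a * b;

            return (unsigned long long)(product >> 64);     /* high 64 bits */
    }
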
+config CC_IMPLICIT_FALLTHROUGH
+ string
+ default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5)
+ default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough)
+
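The warning flag selected by CC_IMPLICIT_FALLTHROUGH reports unannotated switch fall-through; at the strict levels chosen above only the statement attribute (wrapped by the kernel's fallthrough macro) silences it. A small sketch with made-up names:

    static int whitespace;      /* counter used by the example below */

    static int classify(int c)
    {
            switch (c) {
            case ' ':
                    whitespace++;                           /* do some work ... */
                    __attribute__((__fallthrough__));       /* ... then fall through on purpose */
            case '\t':
                    return 1;
            default:
                    return 0;
            }
    }
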
+# Currently, disable gcc-10+ array-bounds globally.
+# It's still broken in gcc-13, so no upper bound yet.
+config GCC10_NO_ARRAY_BOUNDS
+ def_bool y
+
+config CC_NO_ARRAY_BOUNDS
bool
+ default y if CC_IS_GCC && GCC_VERSION >= 90000 && GCC10_NO_ARRAY_BOUNDS
+
+# Currently, disable -Wstringop-overflow for GCC globally.
+config GCC_NO_STRINGOP_OVERFLOW
+ def_bool y
-config ARCH_USES_NUMA_PROT_NONE
+config CC_NO_STRINGOP_OVERFLOW
bool
- default y
- depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
- depends on NUMA_BALANCING
+ default y if CC_IS_GCC && GCC_NO_STRINGOP_OVERFLOW
-config NUMA_BALANCING_DEFAULT_ENABLED
- bool "Automatically enable NUMA aware memory/task placement"
- default y
- depends on NUMA_BALANCING
- help
- If set, autonumic NUMA balancing will be enabled if running on a NUMA
- machine.
+config CC_STRINGOP_OVERFLOW
+ bool
+ default y if CC_IS_GCC && !CC_NO_STRINGOP_OVERFLOW
+
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+ bool
+
+# For architectures that (ab)use NUMA to represent different memory regions
+# all cpu-local but of different latencies, such as SuperH.
+#
+config ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ bool
config NUMA_BALANCING
bool "Memory placement aware NUMA scheduler"
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
- depends on SMP && NUMA && MIGRATION
+ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
- it is references to the node the task is running on.
+ it has references to the node the task is running on.
This system will be inactive on UMA systems.
+config NUMA_BALANCING_DEFAULT_ENABLED
+ bool "Automatically enable NUMA aware memory/task placement"
+ default y
+ depends on NUMA_BALANCING
+ help
+ If set, automatic NUMA balancing will be enabled if running on a NUMA
+ machine.
+
+config SLAB_OBJ_EXT
+ bool
+
menuconfig CGROUPS
- boolean "Control Group support"
- depends on EVENTFD
+ bool "Control Group support"
+ select KERNFS
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
controls or device isolation.
See
- - Documentation/scheduler/sched-design-CFS.txt (CFS)
- - Documentation/cgroups/ (features for grouping, isolation
+ - Documentation/scheduler/sched-design-CFS.rst (CFS)
+ - Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation
and resource control)
Say N if unsure.
if CGROUPS
-config CGROUP_DEBUG
- bool "Example debug cgroup subsystem"
- default n
- help
- This option enables a simple cgroup subsystem that
- exports useful debugging information about the cgroups
- framework.
-
- Say N if unsure.
+config PAGE_COUNTER
+ bool
-config CGROUP_FREEZER
- bool "Freezer cgroup subsystem"
- help
- Provides a way to freeze and unfreeze all tasks in a
- cgroup.
+config CGROUP_FAVOR_DYNMODS
+ bool "Favor dynamic modification latency reduction by default"
+ help
+ This option enables the "favordynmods" mount option by default
+ which reduces the latencies of dynamic cgroup modifications such
+ as task migrations and controller on/offs at the cost of making
+ hot path operations such as forks and exits more expensive.
-config CGROUP_DEVICE
- bool "Device controller for cgroups"
- help
- Provides a cgroup implementing whitelists for devices which
- a process in the cgroup can mknod or open.
+ Say N if unsure.
-config CPUSETS
- bool "Cpuset support"
+config MEMCG
+ bool "Memory controller"
+ select PAGE_COUNTER
+ select EVENTFD
+ select SLAB_OBJ_EXT
+ select VM_EVENT_COUNTERS
help
- This option will let you create and manage CPUSETs which
- allow dynamically partitioning a system into sets of CPUs and
- Memory Nodes and assigning tasks to run only within those sets.
- This is primarily useful on large SMP or NUMA systems.
+ Provides control over the memory footprint of tasks in a cgroup.
- Say N if unsure.
+config MEMCG_NMI_UNSAFE
+ bool
+ depends on MEMCG
+ depends on HAVE_NMI
+ depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !ARCH_HAVE_NMI_SAFE_CMPXCHG
+ default y
-config PROC_PID_CPUSET
- bool "Include legacy /proc/<pid>/cpuset file"
- depends on CPUSETS
+config MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+ bool
+ depends on MEMCG
+ depends on HAVE_NMI
+ depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && ARCH_HAVE_NMI_SAFE_CMPXCHG
default y
-config CGROUP_CPUACCT
- bool "Simple CPU accounting cgroup subsystem"
+config MEMCG_V1
+ bool "Legacy cgroup v1 memory controller"
+ depends on MEMCG
+ default n
help
- Provides a simple Resource Controller for monitoring the
- total CPU consumed by the tasks in a cgroup.
+ Legacy cgroup v1 memory controller which has been deprecated by
+ cgroup v2 implementation. The v1 is there for legacy applications
+ which haven't migrated to the new cgroup v2 interface yet. If you
+ do not have any such application then you are completely fine leaving
+ this option disabled.
-config RESOURCE_COUNTERS
- bool "Resource counters"
- help
- This option enables controller independent resource accounting
- infrastructure that works with cgroups.
+ Please note that feature set of the legacy memory controller is likely
+ going to shrink due to deprecation process. New deployments with v1
+ controller are highly discouraged.
-config MEMCG
- bool "Memory Resource Controller for Control Groups"
- depends on RESOURCE_COUNTERS
- select MM_OWNER
- help
- Provides a memory resource controller that manages both anonymous
- memory and page cache. (See Documentation/cgroups/memory.txt)
-
- Note that setting this option increases fixed memory overhead
- associated with each page of memory in the system. By this,
- 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
- usage tracking struct at boot. Total amount of this is printed out
- at boot.
-
- Only enable when you're ok with these trade offs and really
- sure you need the memory resource controller. Even when you enable
- this, you can set "cgroup_disable=memory" at your boot option to
- disable memory resource controller and you can avoid overheads.
- (and lose benefits of memory resource controller)
-
- This config option also selects MM_OWNER config option, which
- could in turn add some fork/exit overhead.
-
-config MEMCG_SWAP
- bool "Memory Resource Controller Swap Extension"
- depends on MEMCG && SWAP
- help
- Add swap management feature to memory resource controller. When you
- enable this, you can limit mem+swap usage per cgroup. In other words,
- when you disable this, memory resource controller has no cares to
- usage of swap...a process can exhaust all of the swap. This extension
- is useful when you want to avoid exhaustion swap but this itself
- adds more overheads and consumes memory for remembering information.
- Especially if you use 32bit system or small memory system, please
- be careful about enabling this. When memory resource controller
- is disabled by boot option, this will be automatically disabled and
- there will be no overhead from this. Even when you set this config=y,
- if boot option "swapaccount=0" is set, swap will not be accounted.
- Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
- size is 4096bytes, 512k per 1Gbytes of swap.
-config MEMCG_SWAP_ENABLED
- bool "Memory Resource Controller Swap Extension enabled by default"
- depends on MEMCG_SWAP
- default y
- help
- Memory Resource Controller Swap Extension comes with its price in
- a bigger memory consumption. General purpose distribution kernels
- which want to enable the feature but keep it disabled by default
- and let the user enable it by swapaccount boot command line
- parameter should have this option unselected.
- For those who want to have the feature enabled by default should
- select this option (if, for some reason, they need to disable it
- then swapaccount=0 does the trick).
-config MEMCG_KMEM
- bool "Memory Resource Controller Kernel Memory accounting"
- depends on MEMCG
- depends on SLUB || SLAB
- help
- The Kernel Memory extension for Memory Resource Controller can limit
- the amount of memory used by kernel objects in the system. Those are
- fundamentally different from the entities handled by the standard
- Memory Controller, which are page-based, and can be swapped. Users of
- the kmem extension can use it to guarantee that no group of processes
- will ever exhaust kernel resources alone.
+ Say N if unsure.
-config CGROUP_HUGETLB
- bool "HugeTLB Resource Controller for Control Groups"
- depends on RESOURCE_COUNTERS && HUGETLB_PAGE
+config BLK_CGROUP
+ bool "IO controller"
+ depends on BLOCK
default n
help
- Provides a cgroup Resource Controller for HugeTLB pages.
- When you enable this, you can put a per cgroup limit on HugeTLB usage.
- The limit is enforced during page fault. Since HugeTLB doesn't
- support page reclaim, enforcing the limit at page fault time implies
- that, the application will get SIGBUS signal if it tries to access
- HugeTLB pages beyond its limit. This requires the application to know
- beforehand how much HugeTLB pages it would require for its use. The
- control group is tracked in the third page lru pointer. This means
- that we cannot use the controller with huge page less than 3 pages.
+ Generic block IO controller cgroup interface. This is the common
+ cgroup interface which should be used by various IO controlling
+ policies.
-config CGROUP_PERF
- bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
- depends on PERF_EVENTS && CGROUPS
- help
- This option extends the per-cpu mode to restrict monitoring to
- threads which belong to the cgroup specified and run on the
- designated cpu.
+ Currently, CFQ IO scheduler uses it to recognize task groups and
+ control disk bandwidth allocation (proportional time slice allocation)
+ to such task groups. It is also used by bio throttling logic in
+ block layer to implement upper limit in IO rates on a device.
- Say N if unsure.
+ This option only enables generic Block IO controller infrastructure.
+ One needs to also enable actual IO controlling logic/policy. For
+ enabling proportional weight division of disk bandwidth in CFQ, set
+ CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
+ CONFIG_BLK_DEV_THROTTLING=y.
+
+ See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
+
+config CGROUP_WRITEBACK
+ bool
+ depends on MEMCG && BLK_CGROUP
+ default y
menuconfig CGROUP_SCHED
- bool "Group CPU scheduler"
+ bool "CPU controller"
default n
help
This feature lets CPU scheduler recognize task groups and control CPU
@@ -1006,21 +1106,29 @@ menuconfig CGROUP_SCHED
tasks.
if CGROUP_SCHED
+config GROUP_SCHED_WEIGHT
+ def_bool n
+
+config GROUP_SCHED_BANDWIDTH
+ def_bool n
+
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on CGROUP_SCHED
+ select GROUP_SCHED_WEIGHT
default CGROUP_SCHED
config CFS_BANDWIDTH
bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
depends on FAIR_GROUP_SCHED
+ select GROUP_SCHED_BANDWIDTH
default n
help
This option allows users to define CPU bandwidth rates (limits) for
tasks running within the fair group scheduler. Groups with no limit
set are considered to be unconstrained and will run with no
restriction.
- See tip/Documentation/scheduler/sched-bwc.txt for more information.
+ See Documentation/scheduler/sched-bwc.rst for more information.
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
@@ -1031,55 +1139,219 @@ config RT_GROUP_SCHED
to task groups. If enabled, it will also make it impossible to
schedule realtime tasks for non-root users until you allocate
realtime bandwidth for them.
- See Documentation/scheduler/sched-rt-group.txt for more information.
+ See Documentation/scheduler/sched-rt-group.rst for more information.
+
+config RT_GROUP_SCHED_DEFAULT_DISABLED
+ bool "Require boot parameter to enable group scheduling for SCHED_RR/FIFO"
+ depends on RT_GROUP_SCHED
+ default n
+ help
+ When set, the RT group scheduling is disabled by default. The option
+ is in inverted form so that mere RT_GROUP_SCHED enables the group
+ scheduling.
+
+ Say N if unsure.
+
+config EXT_GROUP_SCHED
+ bool
+ depends on SCHED_CLASS_EXT && CGROUP_SCHED
+ select GROUP_SCHED_WEIGHT
+ select GROUP_SCHED_BANDWIDTH
+ default y
endif #CGROUP_SCHED
-config BLK_CGROUP
- bool "Block IO controller"
- depends on BLOCK
+config SCHED_MM_CID
+ def_bool y
+ depends on SMP && RSEQ
+
+config UCLAMP_TASK_GROUP
+ bool "Utilization clamping per group of tasks"
+ depends on CGROUP_SCHED
+ depends on UCLAMP_TASK
default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
+ help
+ This feature enables the scheduler to track the clamped utilization
+ of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups. It is also used by bio throttling logic in
- block layer to implement upper limit in IO rates on a device.
+ When this option is enabled, the user can specify a min and max
+ CPU bandwidth which is allowed for each single task in a group.
+ The max bandwidth allows to clamp the maximum frequency a task
+ can use, while the min bandwidth allows to define a minimum
+ frequency a task will always use.
- This option only enables generic Block IO controller infrastructure.
- One needs to also enable actual IO controlling logic/policy. For
- enabling proportional weight division of disk bandwidth in CFQ, set
- CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
- CONFIG_BLK_DEV_THROTTLING=y.
+ When task group based utilization clamping is enabled, an eventually
+ specified task-specific clamp value is constrained by the cgroup
+ specified clamp value. Both minimum and maximum task clamping cannot
+ be bigger than the corresponding clamping defined at task group level.
+
+ If in doubt, say N.
+
+config CGROUP_PIDS
+ bool "PIDs controller"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to halt without being limited by other cgroup policies. The
+ PIDs controller is designed to stop this from happening.
- See Documentation/cgroups/blkio-controller.txt for more information.
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy) will *not* be blocked by the PIDs controller,
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
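For a concrete picture of how the limit is consumed, here is a minimal
user-space sketch (illustrative only, not part of this patch): it caps a
cgroup at 64 tasks by writing its pids.max interface file. The path
/sys/fs/cgroup/mygroup is hypothetical and assumes a mounted cgroup2
hierarchy with the pids controller enabled.

    /* Cap a hypothetical cgroup at 64 tasks via its pids.max file. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/fs/cgroup/mygroup/pids.max", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "64", 2) != 2)
            perror("write");
        close(fd);
        return 0;
    }

Once the limit is in place, a fork() or clone() from tasks in that cgroup
fails with EAGAIN whenever it would exceed 64 running tasks.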
+config CGROUP_RDMA
+ bool "RDMA controller"
+ help
+ Provides enforcement of RDMA resources defined by the IB stack.
+ It is fairly easy for consumers to exhaust RDMA resources, which
+ can result in resource unavailability for other consumers.
+ The RDMA controller is designed to stop this from happening.
+ Attaching processes with active RDMA resources to the cgroup
+ hierarchy is allowed even if doing so crosses the hierarchy's limit.
+
+config CGROUP_DMEM
+ bool "Device memory controller (DMEM)"
+ select PAGE_COUNTER
+ help
+ The DMEM controller allows compatible devices to restrict device
+ memory usage based on the cgroup hierarchy.
+
+ As an example, it allows you to restrict VRAM usage for applications
+ in the DRM subsystem.
+
+config CGROUP_FREEZER
+ bool "Freezer controller"
+ help
+ Provides a way to freeze and unfreeze all tasks in a
+ cgroup.
-config DEBUG_BLK_CGROUP
- bool "Enable Block IO controller debugging"
- depends on BLK_CGROUP
+ This option affects the ORIGINAL cgroup interface. The cgroup2 memory
+ controller includes important in-kernel memory consumers by default.
+
+ If you're using cgroup2, say N.
+
+config CGROUP_HUGETLB
+ bool "HugeTLB controller"
+ depends on HUGETLB_PAGE
+ select PAGE_COUNTER
default n
- ---help---
- Enable some debugging help. Currently it exports additional stat
- files in a cgroup which can be useful for debugging.
+ help
+ Provides a cgroup controller for HugeTLB pages.
+ When you enable this, you can put a per cgroup limit on HugeTLB usage.
+ The limit is enforced during page fault. Since HugeTLB doesn't
+ support page reclaim, enforcing the limit at page fault time implies
+ that the application will get a SIGBUS signal if it tries to access
+ HugeTLB pages beyond its limit. This requires the application to know
+ beforehand how many HugeTLB pages it would require for its use. The
+ control group is tracked in the third page lru pointer. This means
+ that we cannot use the controller with huge pages smaller than 3 pages.
-endif # CGROUPS
+config CPUSETS
+ bool "Cpuset controller"
+ depends on SMP
+ select UNION_FIND
+ help
+ This option will let you create and manage CPUSETs which
+ allow dynamically partitioning a system into sets of CPUs and
+ Memory Nodes and assigning tasks to run only within those sets.
+ This is primarily useful on large SMP or NUMA systems.
-config CHECKPOINT_RESTORE
- bool "Checkpoint/restore support" if EXPERT
+ Say N if unsure.
+
+config CPUSETS_V1
+ bool "Legacy cgroup v1 cpusets controller"
+ depends on CPUSETS
default n
help
- Enables additional kernel features in a sake of checkpoint/restore.
- In particular it adds auxiliary prctl codes to setup process text,
- data and heap segment sizes, and a few additional /proc filesystem
- entries.
+ Legacy cgroup v1 cpusets controller which has been deprecated by
+ cgroup v2 implementation. The v1 is there for legacy applications
+ which haven't migrated to the new cgroup v2 interface yet. Legacy
+ interface includes cpuset filesystem and /proc/<pid>/cpuset. If you
+ do not have any such application then you are completely fine leaving
+ this option disabled.
- If unsure, say N here.
+ Say N if unsure.
+
+config PROC_PID_CPUSET
+ bool "Include legacy /proc/<pid>/cpuset file"
+ depends on CPUSETS_V1
+ default y
+
+config CGROUP_DEVICE
+ bool "Device controller"
+ help
+ Provides a cgroup controller implementing whitelists for
+ devices which a process in the cgroup can mknod or open.
+
+config CGROUP_CPUACCT
+ bool "Simple CPU accounting controller"
+ help
+ Provides a simple controller for monitoring the
+ total CPU consumed by the tasks in a cgroup.
+
+config CGROUP_PERF
+ bool "Perf controller"
+ depends on PERF_EVENTS
+ help
+ This option extends the perf per-cpu mode to restrict monitoring
+ to threads which belong to the cgroup specified and run on the
+ designated cpu. Or this can be used to have cgroup ID in samples
+ so that it can monitor performance events among cgroups.
+
+ Say N if unsure.
+
+config CGROUP_BPF
+ bool "Support for eBPF programs attached to cgroups"
+ depends on BPF_SYSCALL
+ select SOCK_CGROUP_DATA
+ help
+ Allow attaching eBPF programs to a cgroup using the bpf(2)
+ syscall command BPF_PROG_ATTACH.
+
+ In which context these programs are accessed depends on the type
+ of attachment. For instance, programs that are attached using
+ BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+ inet sockets.
+
+config CGROUP_MISC
+ bool "Misc resource controller"
+ default n
+ help
+ Provides a controller for miscellaneous resources on a host.
+
+ Miscellaneous scalar resources are the resources on the host system
+ which cannot be abstracted like the other cgroups. This controller
+ tracks and limits the miscellaneous resources used by a process
+ attached to a cgroup hierarchy.
+
+ For more information, please check misc cgroup section in
+ /Documentation/admin-guide/cgroup-v2.rst.
+
+config CGROUP_DEBUG
+ bool "Debug controller"
+ default n
+ depends on DEBUG_KERNEL
+ help
+ This option enables a simple controller that exports
+ debugging information about the cgroups framework. This
+ controller is for control cgroup debugging only. Its
+ interfaces are not stable.
+
+ Say N.
+
+config SOCK_CGROUP_DATA
+ bool
+ default n
+
+endif # CGROUPS
menuconfig NAMESPACES
bool "Namespaces support" if EXPERT
+ depends on MULTIUSER
default !EXPERT
help
Provides the way to make tasks work with different objects using
@@ -1096,6 +1368,14 @@ config UTS_NS
In this namespace tasks see different info provided with the
uname() system call
+config TIME_NS
+ bool "TIME namespace"
+ depends on GENERIC_GETTIMEOFDAY
+ default y
+ help
+ In this namespace the boottime and monotonic clocks can be set.
+ Time will keep passing at the same pace.
+
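As an illustration of what "can be set" means, the following hedged
user-space sketch shifts CLOCK_MONOTONIC forward by 10 seconds in a new
time namespace. It assumes CAP_SYS_ADMIN and the /proc/<pid>/timens_offsets
interface; the parent stays in its original namespace, so only the forked
child observes the offset.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/wait.h>

    #ifndef CLONE_NEWTIME
    #define CLONE_NEWTIME 0x00000080    /* from <linux/sched.h> */
    #endif

    int main(void)
    {
        struct timespec ts;
        int fd;

        if (unshare(CLONE_NEWTIME)) {           /* create the time namespace */
            perror("unshare");
            return 1;
        }
        /* Offsets may only be written before the first task enters the ns. */
        fd = open("/proc/self/timens_offsets", O_WRONLY);
        if (fd < 0 || write(fd, "monotonic 10 0\n", 15) < 0) {
            perror("timens_offsets");
            return 1;
        }
        close(fd);
        if (fork() == 0) {                      /* the child enters the ns */
            clock_gettime(CLOCK_MONOTONIC, &ts);
            printf("child monotonic: %ld s\n", (long)ts.tv_sec);
            return 0;
        }
        wait(NULL);
        return 0;
    }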
config IPC_NS
bool "IPC namespace"
depends on (SYSVIPC || POSIX_MQUEUE)
@@ -1106,19 +1386,15 @@ config IPC_NS
config USER_NS
bool "User namespace"
- depends on UIDGID_CONVERTED
- select UIDGID_STRICT_TYPE_CHECKS
-
default n
help
This allows containers, i.e. vservers, to use user namespaces
to provide different user info for different servers.
When user namespaces are enabled in the kernel it is
- recommended that the MEMCG and MEMCG_KMEM options also be
- enabled and that user-space use the memory control groups to
- limit the amount of memory a memory unprivileged users can
- use.
+ recommended that the MEMCG option also be enabled and that
+ user-space use the memory control groups to limit the amount
+ of memory that unprivileged users can use.
If unsure, say N.
@@ -1140,30 +1416,22 @@ config NET_NS
endif # NAMESPACES
-config UIDGID_CONVERTED
- # True if all of the selected software conmponents are known
- # to have uid_t and gid_t converted to kuid_t and kgid_t
- # where appropriate and are otherwise safe to use with
- # the user namespace.
- bool
- default y
-
- # Filesystems
- depends on XFS_FS = n
-
-config UIDGID_STRICT_TYPE_CHECKS
- bool "Require conversions between uid/gids and their internal representation"
- depends on UIDGID_CONVERTED
+config CHECKPOINT_RESTORE
+ bool "Checkpoint/restore support"
+ depends on PROC_FS
+ select PROC_CHILDREN
+ select KCMP
default n
help
- While the nececessary conversions are being added to all subsystems this option allows
- the code to continue to build for unconverted subsystems.
+ Enables additional kernel features for the sake of checkpoint/restore.
+ In particular it adds auxiliary prctl codes to setup process text,
+ data and heap segment sizes, and a few additional /proc filesystem
+ entries.
- Say Y here if you want the strict type checking enabled
+ If unsure, say N here.
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
- select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
@@ -1174,49 +1442,9 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
-config MM_OWNER
- bool
-
-config SYSFS_DEPRECATED
- bool "Enable deprecated sysfs features to support old userspace tools"
- depends on SYSFS
- default n
- help
- This option adds code that switches the layout of the "block" class
- devices, to not show up in /sys/class/block/, but only in
- /sys/block/.
-
- This switch is only active when the sysfs.deprecated=1 boot option is
- passed or the SYSFS_DEPRECATED_V2 option is set.
-
- This option allows new kernels to run on old distributions and tools,
- which might get confused by /sys/class/block/. Since 2007/2008 all
- major distributions and tools handle this just fine.
-
- Recent distributions and userspace tools after 2009/2010 depend on
- the existence of /sys/class/block/, and will not work with this
- option enabled.
-
- Only if you are using a new kernel on an old distribution, you might
- need to say Y here.
-
-config SYSFS_DEPRECATED_V2
- bool "Enable deprecated sysfs features by default"
- default n
- depends on SYSFS
- depends on SYSFS_DEPRECATED
- help
- Enable deprecated sysfs by default.
-
- See the CONFIG_SYSFS_DEPRECATED option for more details about this
- option.
-
- Only if you are using a new kernel on an old distribution, you might
- need to say Y here. Even then, odds are you would not need it
- enabled, you can always pass the boot option if absolutely necessary.
-
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
+ select IRQ_WORK
help
This option enables support for relay interface support in
certain file systems (such as debugfs).
@@ -1228,13 +1456,12 @@ config RELAY
config BLK_DEV_INITRD
bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
- depends on BROKEN || !FRV
help
The initial RAM filesystem is a ramfs which is loaded by the
boot loader (loadlin or lilo) and that is mounted as root
before the normal boot procedure. It is typically used to
load modules needed to mount the "real" root file system,
- etc. See <file:Documentation/initrd.txt> for details.
+ etc. See <file:Documentation/admin-guide/initrd.rst> for details.
If RAM disk support (BLK_DEV_RAM) is also included, this
also enables initial RAM disk (initrd) support and adds
@@ -1248,18 +1475,146 @@ source "usr/Kconfig"
endif
-config CC_OPTIMIZE_FOR_SIZE
- bool "Optimize for size"
+config BOOT_CONFIG
+ bool "Boot config support"
+ select BLK_DEV_INITRD if !BOOT_CONFIG_EMBED
help
- Enabling this option will pass "-Os" instead of "-O2" to gcc
- resulting in a smaller kernel.
+ Extra boot config allows a system admin to pass a config file as
+ a supplemental extension of the kernel cmdline when booting.
+ The boot config file must be attached at the end of initramfs
+ with checksum, size and magic word.
+ See <file:Documentation/admin-guide/bootconfig.rst> for details.
+
+ If unsure, say Y.
+
+config BOOT_CONFIG_FORCE
+ bool "Force unconditional bootconfig processing"
+ depends on BOOT_CONFIG
+ default y if BOOT_CONFIG_EMBED
+ help
+ With this Kconfig option set, BOOT_CONFIG processing is carried
+ out even when the "bootconfig" kernel-boot parameter is omitted.
+ In fact, with this Kconfig option set, there is no way to
+ make the kernel ignore the BOOT_CONFIG-supplied kernel-boot
+ parameters.
If unsure, say N.
-config SYSCTL
+config BOOT_CONFIG_EMBED
+ bool "Embed bootconfig file in the kernel"
+ depends on BOOT_CONFIG
+ help
+ Embed a bootconfig file given by BOOT_CONFIG_EMBED_FILE in the
+ kernel. Usually, the bootconfig file is loaded with the initrd
+ image. But if the system doesn't support initrd, this option will
+ help you by embedding a bootconfig file while building the kernel.
+
+ If unsure, say N.
+
+config BOOT_CONFIG_EMBED_FILE
+ string "Embedded bootconfig file path"
+ depends on BOOT_CONFIG_EMBED
+ help
+ Specify a bootconfig file which will be embedded in the kernel.
+ This bootconfig will be used if there is no initrd or no other
+ bootconfig in the initrd.
+
+config CMDLINE_LOG_WRAP_IDEAL_LEN
+ int "Length to try to wrap the cmdline when logged at boot"
+ default 1021
+ range 0 1021
+ help
+ At boot time, the kernel command line is logged to the console.
+ The log message will start with the prefix "Kernel command line: ".
+ The log message will attempt to be wrapped (split into multiple log
+ messages) at spaces based on CMDLINE_LOG_WRAP_IDEAL_LEN characters.
+ If wrapping happens, each log message will start with the prefix and
+ all but the last message will end with " \". Messages may exceed the
+ ideal length if a place to wrap isn't found before the specified
+ number of characters.
+
+ A value of 0 disables wrapping, though be warned that the maximum
+ length of a log message (1021 characters) may cause the cmdline to
+ be truncated.
+
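The rule described above amounts to "cut each chunk at the last space at
or before the ideal length, and overshoot to the next space when none is
found". The sketch below is illustrative only (plain user-space C, not the
kernel's implementation) and merely demonstrates that rule.

    #include <stdio.h>
    #include <string.h>

    static void wrap_cmdline(const char *s, size_t ideal)
    {
        size_t len = strlen(s);

        while (len > ideal) {
            size_t cut = ideal;

            while (cut > 0 && s[cut] != ' ')    /* last space before ideal */
                cut--;
            if (cut == 0) {                     /* none found: overshoot */
                cut = ideal;
                while (cut < len && s[cut] != ' ')
                    cut++;
                if (cut == len)
                    break;
            }
            printf("Kernel command line: %.*s \\\n", (int)cut, s);
            s += cut + 1;                       /* skip the space itself */
            len -= cut + 1;
        }
        printf("Kernel command line: %s\n", s);
    }

    int main(void)
    {
        wrap_cmdline("root=/dev/sda1 ro console=ttyS0,115200 quiet splash", 24);
        return 0;
    }

With an ideal length of 24 this prints three messages, all starting with
the prefix and all but the last ending in " \".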
+config INITRAMFS_PRESERVE_MTIME
+ bool "Preserve cpio archive mtimes in initramfs"
+ depends on BLK_DEV_INITRD
+ default y
+ help
+ Each entry in an initramfs cpio archive carries an mtime value. When
+ enabled, extracted cpio items take this mtime, with directory mtime
+ setting deferred until after creation of any child entries.
+
+ If unsure, say Y.
+
+config INITRAMFS_TEST
+ bool "Test initramfs cpio archive extraction" if !KUNIT_ALL_TESTS
+ depends on BLK_DEV_INITRD && KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ Build KUnit tests for initramfs. See Documentation/dev-tools/kunit
+
+choice
+ prompt "Compiler optimization level"
+ default CC_OPTIMIZE_FOR_PERFORMANCE
+
+config CC_OPTIMIZE_FOR_PERFORMANCE
+ bool "Optimize for performance (-O2)"
+ help
+ This is the default optimization level for the kernel, building
+ with the "-O2" compiler flag for best performance and most
+ helpful compile-time warnings.
+
+config CC_OPTIMIZE_FOR_SIZE
+ bool "Optimize for size (-Os)"
+ help
+ Choosing this option will pass "-Os" to your compiler resulting
+ in a smaller kernel.
+
+endchoice
+
+config HAVE_LD_DEAD_CODE_DATA_ELIMINATION
bool
+ help
+ This requires that the arch annotates or otherwise protects
+ its external entry points from being discarded. Linker scripts
+ must also merge .text.*, .data.*, and .bss.* correctly into
+ output sections. Care must be taken not to pull in unrelated
+ sections (e.g., '.text.init'). Typically '.' in section names
+ is used to distinguish them from label names / C identifiers.
-config ANON_INODES
+config LD_DEAD_CODE_DATA_ELIMINATION
+ bool "Dead code and data elimination (EXPERIMENTAL)"
+ depends on HAVE_LD_DEAD_CODE_DATA_ELIMINATION
+ depends on EXPERT
+ depends on $(cc-option,-ffunction-sections -fdata-sections)
+ depends on $(ld-option,--gc-sections)
+ help
+ Enable this if you want to do dead code and data elimination with
+ the linker by compiling with -ffunction-sections -fdata-sections,
+ and linking with --gc-sections.
+
+ This can reduce on disk and in-memory size of the kernel
+ code and static data, particularly for small configs and
+ on small systems. This has the possibility of introducing a
+ silently broken kernel if the required annotations are not
+ present. This option is not well tested yet, so use at your
+ own risk.
+
+config LD_ORPHAN_WARN
+ def_bool y
+ depends on ARCH_WANT_LD_ORPHAN_WARN
+ depends on $(ld-option,--orphan-handling=warn)
+ depends on $(ld-option,--orphan-handling=error)
+
+config LD_ORPHAN_WARN_LEVEL
+ string
+ depends on LD_ORPHAN_WARN
+ default "error" if WERROR
+ default "warn"
+
+config SYSCTL
bool
config HAVE_UID16
@@ -1285,6 +1640,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
the unaligned access emulation.
see arch/parisc/kernel/unaligned.c for reference
+config SYSFS_SYSCALL
+ bool "Sysfs syscall support"
+ default n
+ help
+ sys_sysfs is an obsolete system call no longer supported in libc.
+ Note that disabling this option is more secure but might break
+ compatibility with some systems.
+
+ If unsure say N here.
+
config HAVE_PCSPKR_PLATFORM
bool
@@ -1294,58 +1659,70 @@ menuconfig EXPERT
select DEBUG_KERNEL
help
This option allows certain base kernel options and settings
- to be disabled or tweaked. This is for specialized
- environments which can tolerate a "non-standard" kernel.
- Only use this if you really know what you are doing.
+ to be disabled or tweaked. This is for specialized
+ environments which can tolerate a "non-standard" kernel.
+ Only use this if you really know what you are doing.
config UID16
bool "Enable 16-bit UID system calls" if EXPERT
- depends on HAVE_UID16
+ depends on HAVE_UID16 && MULTIUSER
default y
help
This enables the legacy 16-bit UID syscall wrappers.
-config SYSCTL_SYSCALL
- bool "Sysctl syscall support" if EXPERT
- depends on PROC_SYSCTL
- default n
- select SYSCTL
- ---help---
- sys_sysctl uses binary paths that have been found challenging
- to properly maintain and use. The interface in /proc/sys
- using paths with ascii names is now the primary path to this
- information.
+config MULTIUSER
+ bool "Multiple users, groups and capabilities support" if EXPERT
+ default y
+ help
+ This option enables support for non-root users, groups and
+ capabilities.
- Almost nothing using the binary sysctl interface so if you are
- trying to save some space it is probably safe to disable this,
- making your kernel marginally smaller.
+ If you say N here, all processes will run with UID 0, GID 0, and all
+ possible capabilities. Saying N here also compiles out support for
+ system calls related to UIDs, GIDs, and capabilities, such as setuid,
+ setgid, and capset.
- If unsure say N here.
+ If unsure, say Y here.
-config KALLSYMS
- bool "Load all symbols for debugging/ksymoops" if EXPERT
- default y
- help
- Say Y here to let the kernel print out symbolic crash information and
- symbolic stack backtraces. This increases the size of the kernel
- somewhat, as all symbols have to be loaded into the kernel image.
+config SGETMASK_SYSCALL
+ bool "sgetmask/ssetmask syscalls support" if EXPERT
+ default PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH
+ help
+ sys_sgetmask and sys_ssetmask are obsolete system calls
+ no longer supported in libc but still enabled by default in some
+ architectures.
-config KALLSYMS_ALL
- bool "Include all symbols in kallsyms"
- depends on DEBUG_KERNEL && KALLSYMS
+ If unsure, leave the default option here.
+
+config FHANDLE
+ bool "open by fhandle syscalls" if EXPERT
+ select EXPORTFS
+ default y
help
- Normally kallsyms only contains the symbols of functions for nicer
- OOPS messages and backtraces (i.e., symbols from the text and inittext
- sections). This is sufficient for most cases. And only in very rare
- cases (e.g., when a debugger is used) all symbols are required (e.g.,
- names of variables from the data sections, etc).
+ If you say Y here, a user level program will be able to map
+ file names to handles and then later use the handles for
+ different file system operations. This is useful in implementing
+ userspace file servers, which now track files using handles instead
+ of names. The handle would remain the same even if file names
+ get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
+ syscalls.
+
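A hedged sketch of the two syscalls this option provides: a path is
resolved to a handle with name_to_handle_at() and reopened later with
open_by_handle_at(). The path /etc/hostname is arbitrary, and
open_by_handle_at() normally requires CAP_DAC_READ_SEARCH.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        struct file_handle *fh;
        int mount_id, mount_fd, fd;

        fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
        fh->handle_bytes = MAX_HANDLE_SZ;

        if (name_to_handle_at(AT_FDCWD, "/etc/hostname", fh, &mount_id, 0)) {
            perror("name_to_handle_at");
            return 1;
        }
        /* A file server would stash fh and mount_id and reopen much later;
         * here the mount fd simply comes from the containing directory. */
        mount_fd = open("/etc", O_RDONLY | O_DIRECTORY);
        fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
        if (fd < 0)
            perror("open_by_handle_at");
        else
            close(fd);
        free(fh);
        return 0;
    }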
+config POSIX_TIMERS
+ bool "Posix Clocks & timers" if EXPERT
+ default y
+ help
+ This includes native support for POSIX timers to the kernel.
+ Some embedded systems have no use for them and therefore they
+ can be configured out to reduce the size of the kernel image.
- This option makes sure that all symbols are loaded into the kernel
- image (i.e., symbols from all sections) in cost of increased kernel
- size (depending on the kernel configuration, it may be 300KiB or
- something like this).
+ When this option is disabled, the following syscalls won't be
+ available: timer_create, timer_gettime, timer_getoverrun,
+ timer_settime, timer_delete, clock_adjtime, getitimer,
+ setitimer, alarm. Furthermore, the clock_settime, clock_gettime,
+ clock_getres and clock_nanosleep syscalls will be limited to
+ CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only.
- Say N unless you really need all symbols.
+ If unsure say Y.
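For illustration, a minimal sketch of one interface this option provides:
a one-shot, 2-second CLOCK_MONOTONIC timer that delivers SIGALRM (link
with -lrt on older glibc).

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
        timer_t timerid;
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                .sigev_signo  = SIGALRM };
        struct itimerspec its = { .it_value.tv_sec = 2 };

        if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) {
            perror("timer_create");
            return 1;
        }
        timer_settime(timerid, 0, &its, NULL);
        pause();        /* default SIGALRM action terminates the process */
        return 0;
    }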
config PRINTK
default y
@@ -1358,15 +1735,27 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.
+config PRINTK_RINGBUFFER_KUNIT_TEST
+ tristate "KUnit Test for the printk ringbuffer" if !KUNIT_ALL_TESTS
+ depends on PRINTK && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds the printk ringbuffer KUnit test suite.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation.
+
+ If unsure, say N.
+
config BUG
bool "BUG() support" if EXPERT
default y
help
- Disabling this option eliminates support for BUG and WARN, reducing
- the size of your kernel image and potentially quietly ignoring
- numerous fatal conditions. You should only consider disabling this
- option for embedded systems with no facilities for reporting errors.
- Just say Y.
+ Disabling this option eliminates support for BUG and WARN, reducing
+ the size of your kernel image and potentially quietly ignoring
+ numerous fatal conditions. You should only consider disabling this
+ option for embedded systems with no facilities for reporting errors.
+ Just say Y.
config ELF_CORE
depends on COREDUMP
@@ -1382,37 +1771,50 @@ config PCSPKR_PLATFORM
select I8253_LOCK
default y
help
- This option allows to disable the internal PC-Speaker
- support, saving some memory.
+ This option allows to disable the internal PC-Speaker
+ support, saving some memory.
-config BASE_FULL
- default y
- bool "Enable full-sized data structures for core" if EXPERT
+config BASE_SMALL
+ bool "Enable smaller-sized data structures for core" if EXPERT
help
- Disabling this option reduces the size of miscellaneous core
+ Enabling this option reduces the size of miscellaneous core
kernel data structures. This saves memory on small machines,
but may reduce performance.
config FUTEX
bool "Enable futex support" if EXPERT
+ depends on !(SPARC32 && SMP)
default y
- select RT_MUTEXES
+ imply RT_MUTEXES
help
Disabling this option will cause the kernel to be built without
support for "fast userspace mutexes". The resulting kernel may not
run glibc-based applications correctly.
+config FUTEX_PI
+ bool
+ depends on FUTEX && RT_MUTEXES
+ default y
+
+config FUTEX_PRIVATE_HASH
+ bool
+ depends on FUTEX && !BASE_SMALL && MMU
+ default y
+
+config FUTEX_MPOL
+ bool
+ depends on FUTEX && NUMA
+ default y
+
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
- select ANON_INODES
help
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.
config SIGNALFD
bool "Enable signalfd() system call" if EXPERT
- select ANON_INODES
default y
help
Enable the signalfd() system call that allows to receive signals
@@ -1422,7 +1824,6 @@ config SIGNALFD
config TIMERFD
bool "Enable timerfd() system call" if EXPERT
- select ANON_INODES
default y
help
Enable the timerfd() system call that allows to receive timer
@@ -1432,7 +1833,6 @@ config TIMERFD
config EVENTFD
bool "Enable eventfd() system call" if EXPERT
- select ANON_INODES
default y
help
Enable the eventfd() system call that allows to receive both
@@ -1459,28 +1859,208 @@ config AIO
by some high performance threaded applications. Disabling
this option saves about 7k.
-config PCI_QUIRKS
+config IO_URING
+ bool "Enable IO uring support" if EXPERT
+ select IO_WQ
+ default y
+ help
+ This option enables support for the io_uring interface, enabling
+ applications to submit and complete IO through submission and
+ completion rings that are shared between the kernel and application.
+
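To make the submit/complete flow concrete, a hedged sketch using liburing
(assumed to be installed; link with -luring). A single NOP request is
queued on the submission ring, submitted, and its completion is reaped
from the completion ring.

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0)) {    /* 8-entry rings */
            fprintf(stderr, "io_uring_queue_init failed\n");
            return 1;
        }
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);                    /* no-op request */
        io_uring_submit(&ring);
        io_uring_wait_cqe(&ring, &cqe);            /* reap the completion */
        printf("nop completed, res=%d\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
    }

liburing is only a convenience wrapper; the same flow can be driven with
the raw io_uring_setup(2) and io_uring_enter(2) syscalls.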
+config GCOV_PROFILE_URING
+ bool "Enable GCOV profiling on the io_uring subsystem"
+ depends on IO_URING && GCOV_KERNEL
+ help
+ Enable GCOV profiling on the io_uring subsystem, to facilitate
+ code coverage testing.
+
+ If unsure, say N.
+
+ Note that this will have a negative impact on the performance of
+ the io_uring subsystem, hence this should only be enabled for
+ specific test purposes.
+
+config IO_URING_MOCK_FILE
+ tristate "Enable io_uring mock files (Experimental)" if EXPERT
+ default n
+ depends on IO_URING
+ help
+ Enable mock files for io_uring subsystem testing. The ABI might
+ still change, so it's experimental and should only be enabled
+ for specific test purposes.
+
+ If unsure, say N.
+
+config ADVISE_SYSCALLS
+ bool "Enable madvise/fadvise syscalls" if EXPERT
+ default y
+ help
+ This option enables the madvise and fadvise syscalls, used by
+ applications to advise the kernel about their future memory or file
+ usage, improving performance. If building an embedded system where no
+ applications use these syscalls, you can disable this option to save
+ space.
+
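A short illustrative sketch of the two hints this option provides, applied
to an arbitrary file (the path is only an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/var/log/syslog", O_RDONLY);
        off_t len;
        void *p;

        if (fd < 0) {
            perror("open");
            return 1;
        }
        len = lseek(fd, 0, SEEK_END);
        /* fadvise: the file will be read sequentially */
        posix_fadvise(fd, 0, len, POSIX_FADV_SEQUENTIAL);

        p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p != MAP_FAILED) {
            /* madvise: ask for readahead of the whole mapping */
            madvise(p, len, MADV_WILLNEED);
            munmap(p, len);
        }
        close(fd);
        return 0;
    }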
+config MEMBARRIER
+ bool "Enable membarrier() system call" if EXPERT
+ default y
+ help
+ Enable the membarrier() system call that allows issuing memory
+ barriers across all running threads, which can be used to distribute
+ the cost of user-space memory barriers asymmetrically by transforming
+ pairs of memory barriers into pairs consisting of membarrier() and a
+ compiler barrier.
+
+ If unsure, say Y.
+
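To illustrate the asymmetric pattern the help text describes, a hedged
sketch that pushes the expensive barrier onto the slow side via the raw
membarrier() syscall (glibc provides no wrapper, so syscall(2) is used);
the fast side on other threads only needs a compiler barrier.

    #include <linux/membarrier.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int membarrier(int cmd, unsigned int flags, int cpu_id)
    {
        return syscall(__NR_membarrier, cmd, flags, cpu_id);
    }

    int main(void)
    {
        /* Fast side (other threads) would use: asm volatile("" ::: "memory"); */

        /* Slow side: force a memory barrier on every running thread. */
        if (membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0)) {
            perror("membarrier");
            return 1;
        }
        printf("global membarrier issued\n");
        return 0;
    }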
+config KCMP
+ bool "Enable kcmp() system call" if EXPERT
+ help
+ Enable the kernel resource comparison system call. It provides
+ user-space with the ability to compare two processes to see if they
+ share a common resource, such as a file descriptor or even virtual
+ memory space.
+
+ If unsure, say N.
+
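A minimal illustrative sketch comparing two file descriptors of the
current process via the raw kcmp() syscall; a return of 0 means both
refer to the same open file description, non-zero means they differ.

    #include <fcntl.h>
    #include <linux/kcmp.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        pid_t pid = getpid();
        int fd1 = open("/dev/null", O_RDONLY);
        int fd2 = dup(fd1);                     /* same open file description */
        int fd3 = open("/dev/null", O_RDONLY);  /* a different one */

        printf("fd1 vs fd2: %ld\n", syscall(SYS_kcmp, pid, pid, KCMP_FILE, fd1, fd2));
        printf("fd1 vs fd3: %ld\n", syscall(SYS_kcmp, pid, pid, KCMP_FILE, fd1, fd3));
        return 0;
    }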
+config RSEQ
+ bool "Enable rseq() system call" if EXPERT
+ default y
+ depends on HAVE_RSEQ
+ select MEMBARRIER
+ help
+ Enable the restartable sequences system call. It provides a
+ user-space cache for the current CPU number value, which
+ speeds up getting the current CPU number from user-space,
+ as well as an ABI to speed up user-space operations on
+ per-CPU data.
+
+ If unsure, say Y.
+
+config RSEQ_STATS
+ default n
+ bool "Enable lightweight statistics of restartable sequences" if EXPERT
+ depends on RSEQ && DEBUG_FS
+ help
+ Enable lightweight counters which expose information about the
+ frequency of RSEQ operations via debugfs. Mostly interesting for
+ kernel debugging or performance analysis. While lightweight, it
+ still adds code to the user/kernel mode transitions.
+
+ If unsure, say N.
+
+config RSEQ_DEBUG_DEFAULT_ENABLE
+ default n
+ bool "Enable restartable sequences debug mode by default" if EXPERT
+ depends on RSEQ
+ help
+ This enables the static branch for debug mode of restartable
+ sequences.
+
+ This also can be controlled on the kernel command line via the
+ command line parameter "rseq_debug=0/1" and through debugfs.
+
+ If unsure, say N.
+
+config DEBUG_RSEQ
+ default n
+ bool "Enable debugging of rseq() system call" if EXPERT
+ depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY
+ select RSEQ_DEBUG_DEFAULT_ENABLE
+ help
+ Enable extra debugging checks for the rseq system call.
+
+ If unsure, say N.
+
+config CACHESTAT_SYSCALL
+ bool "Enable cachestat() system call" if EXPERT
default y
- bool "Enable PCI quirk workarounds" if EXPERT
- depends on PCI
help
- This enables workarounds for various PCI chipset
- bugs/quirks. Disable this only if your target machine is
- unaffected by PCI quirks.
+ Enable the cachestat system call, which queries the page cache
+ statistics of a file (number of cached pages, dirty pages,
+ pages marked for writeback, (recently) evicted pages).
-config EMBEDDED
- bool "Embedded system"
- select EXPERT
+ If unsure say Y here.
+
+config KALLSYMS
+ bool "Load all symbols for debugging/ksymoops" if EXPERT
+ default y
help
- This option should be enabled if compiling the kernel for
- an embedded system so certain expert options are available
- for configuration.
+ Say Y here to let the kernel print out symbolic crash information and
+ symbolic stack backtraces. This increases the size of the kernel
+ somewhat, as all symbols have to be loaded into the kernel image.
+
+config KALLSYMS_SELFTEST
+ bool "Test the basic functions and performance of kallsyms"
+ depends on KALLSYMS
+ default n
+ help
+ Test the basic functions and performance of some interfaces, such as
+ kallsyms_lookup_name. It also calculates the compression rate of the
+ kallsyms compression algorithm for the current symbol set.
+
+ The self-test starts automatically after system startup. Run
+ "dmesg | grep kallsyms_selftest" to collect the test results; "finish"
+ is displayed on the last line, indicating that the test is complete.
+
+config KALLSYMS_ALL
+ bool "Include all symbols in kallsyms"
+ depends on DEBUG_KERNEL && KALLSYMS
+ help
+ Normally kallsyms only contains the symbols of functions for nicer
+ OOPS messages and backtraces (i.e., symbols from the text and inittext
+ sections). This is sufficient for most cases. All symbols (i.e.,
+ names of variables from the data sections, etc.) are required only
+ if you want to enable kernel live patching, or for other less
+ common use cases (e.g., when a debugger is used).
+
+ This option makes sure that all symbols are loaded into the kernel
+ image (i.e., symbols from all sections) at the cost of increased kernel
+ size (depending on the kernel configuration, it may be 300KiB or
+ something like this).
+
+ Say N unless you really need all symbols, or kernel live patching.
+
+# end of the "standard kernel features (expert users)" menu
+
+config ARCH_HAS_MEMBARRIER_CALLBACKS
+ bool
+
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+ bool
+
+config ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
+ bool
+ help
+ Control MSEAL_SYSTEM_MAPPINGS access based on architecture.
+
+ A 64-bit kernel is required for the memory sealing feature.
+ No specific hardware features from the CPU are needed.
+
+ To enable this feature, the architecture needs to update its
+ special mapping calls to include the sealing flag and confirm
+ that it doesn't unmap/remap system mappings during the lifetime
+ of the process. The existence of this flag for an architecture
+ implies that it does not require the remapping of the system
+ mappings during process lifetime, so sealing these mappings is safe
+ from a kernel perspective.
+
+ After the architecture enables this, a distribution can set
+ CONFIG_MSEAL_SYSTEM_MAPPINGS to manage access to the feature.
+
+ For complete descriptions of memory sealing, please see
+ Documentation/userspace-api/mseal.rst
config HAVE_PERF_EVENTS
bool
help
See tools/perf/design.txt for details.
+config GUEST_PERF_EVENTS
+ bool
+ depends on HAVE_PERF_EVENTS
+
config PERF_USE_VMALLOC
bool
help
@@ -1492,7 +2072,6 @@ config PERF_EVENTS
bool "Kernel performance events and counters"
default y if PROFILING
depends on HAVE_PERF_EVENTS
- select ANON_INODES
select IRQ_WORK
help
Enable kernel support for various performance events provided
@@ -1520,120 +2099,85 @@ config PERF_EVENTS
config DEBUG_PERF_USE_VMALLOC
default n
bool "Debug: use vmalloc to back perf mmap() buffers"
- depends on PERF_EVENTS && DEBUG_KERNEL
+ depends on PERF_EVENTS && DEBUG_KERNEL && !PPC
select PERF_USE_VMALLOC
help
- Use vmalloc memory to back perf mmap() buffers.
+ Use vmalloc memory to back perf mmap() buffers.
- Mostly useful for debugging the vmalloc code on platforms
- that don't require it.
+ Mostly useful for debugging the vmalloc code on platforms
+ that don't require it.
- Say N if unsure.
+ Say N if unsure.
endmenu
-config VM_EVENT_COUNTERS
- default y
- bool "Enable VM event counters for /proc/vmstat" if EXPERT
+config SYSTEM_DATA_VERIFICATION
+ def_bool n
+ select SYSTEM_TRUSTED_KEYRING
+ select KEYS
+ select CRYPTO
+ select CRYPTO_RSA
+ select ASYMMETRIC_KEY_TYPE
+ select ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+ select ASN1
+ select OID_REGISTRY
+ select X509_CERTIFICATE_PARSER
+ select PKCS7_MESSAGE_PARSER
help
- VM event counters are needed for event counts to be shown.
- This option allows the disabling of the VM event counters
- on EXPERT systems. /proc/vmstat will only show page counts
- if VM event counters are disabled.
+ Provide PKCS#7 message verification using the contents of the system
+ trusted keyring to provide public keys. This then can be used for
+ module verification, kexec image verification and firmware blob
+ verification.
-config SLUB_DEBUG
- default y
- bool "Enable SLUB debugging support" if EXPERT
- depends on SLUB && SYSFS
+config PROFILING
+ bool "Profiling support"
help
- SLUB has extensive debug support features. Disabling these can
- result in significant savings in code size. This also disables
- SLUB sysfs support. /sys/slab will not exist and there will be
- no support for cache validation etc.
+ Say Y here to enable the extended profiling support mechanisms used
+ by profilers.
-config COMPAT_BRK
- bool "Disable heap randomization"
- default y
+config RUST
+ bool "Rust support"
+ depends on HAVE_RUST
+ depends on RUST_IS_AVAILABLE
+ select EXTENDED_MODVERSIONS if MODVERSIONS
+ depends on !MODVERSIONS || GENDWARFKSYMS
+ depends on !GCC_PLUGIN_RANDSTRUCT
+ depends on !RANDSTRUCT
+ depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO)
+ depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC
+ select CFI_ICALL_NORMALIZE_INTEGERS if CFI
+ depends on !CALL_PADDING || RUSTC_VERSION >= 108100
+ depends on !KASAN_SW_TAGS
+ depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300
help
- Randomizing heap placement makes heap exploits harder, but it
- also breaks ancient binaries (including anything libc5 based).
- This option changes the bootup default to heap randomization
- disabled, and can be overridden at runtime by setting
- /proc/sys/kernel/randomize_va_space to 2.
+ Enables Rust support in the kernel.
- On non-ancient distros (post-2000 ones) N is usually a safe choice.
+ This allows other Rust-related options, like drivers written in Rust,
+ to be selected.
-choice
- prompt "Choose SLAB allocator"
- default SLUB
- help
- This option allows to select a slab allocator.
+ It is also required to be able to load external kernel modules
+ written in Rust.
-config SLAB
- bool "SLAB"
- help
- The regular slab allocator that is established and known to work
- well in all environments. It organizes cache hot objects in
- per cpu and per node queues.
+ See Documentation/rust/ for more information.
-config SLUB
- bool "SLUB (Unqueued Allocator)"
- help
- SLUB is a slab allocator that minimizes cache line usage
- instead of managing queues of cached objects (SLAB approach).
- Per cpu caching is realized using slabs of objects instead
- of queues of objects. SLUB can use memory efficiently
- and has enhanced diagnostics. SLUB is the default choice for
- a slab allocator.
-
-config SLOB
- depends on EXPERT
- bool "SLOB (Simple Allocator)"
- help
- SLOB replaces the stock allocator with a drastically simpler
- allocator. SLOB is generally more space efficient but
- does not perform as well on large systems.
-
-endchoice
+ If unsure, say N.
-config SLUB_CPU_PARTIAL
- default y
- depends on SLUB
- bool "SLUB per cpu partial cache"
- help
- Per cpu partial caches accellerate objects allocation and freeing
- that is local to a processor at the price of more indeterminism
- in the latency of the free. On overflow these caches will be cleared
- which requires the taking of locks that may cause latency spikes.
- Typically one would choose no for a realtime system.
-
-config MMAP_ALLOW_UNINITIALIZED
- bool "Allow mmapped anonymous memory to be uninitialized"
- depends on EXPERT && !MMU
- default n
+config RUSTC_VERSION_TEXT
+ string
+ depends on RUST
+ default "$(RUSTC_VERSION_TEXT)"
help
- Normally, and according to the Linux spec, anonymous memory obtained
- from mmap() has it's contents cleared before it is passed to
- userspace. Enabling this config option allows you to request that
- mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
- providing a huge performance boost. If this option is not enabled,
- then the flag will be ignored.
-
- This is taken advantage of by uClibc's malloc(), and also by
- ELF-FDPIC binfmt's brk and stack allocator.
+ See `CC_VERSION_TEXT`.
- Because of the obvious security issues, this option should only be
- enabled on embedded devices where you control what is run in
- userspace. Since that isn't generally a problem on no-MMU systems,
- it is normally safe to say Y here.
-
- See Documentation/nommu-mmap.txt for more information.
-
-config PROFILING
- bool "Profiling support"
- help
- Say Y here to enable the extended profiling support mechanisms used
- by profilers such as OProfile.
+config BINDGEN_VERSION_TEXT
+ string
+ depends on RUST
+ # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0
+ # (https://github.com/rust-lang/rust-bindgen/pull/2678) and 0.71.0
+ # (https://github.com/rust-lang/rust-bindgen/pull/3040). It can be removed
+ # when the minimum version is upgraded past the latter (0.69.1 and 0.71.1
+ # both fixed the issue).
+ default "$(shell,$(BINDGEN) --version workaround-for-0.69.0 2>/dev/null)"
#
# Place an empty function call at each tracepoint site. Can be
@@ -1641,179 +2185,25 @@ config PROFILING
#
config TRACEPOINTS
bool
+ select TASKS_TRACE_RCU
-source "arch/Kconfig"
+source "kernel/Kconfig.kexec"
-endmenu # General setup
+source "kernel/liveupdate/Kconfig"
-config HAVE_GENERIC_DMA_COHERENT
- bool
- default n
+endmenu # General setup
-config SLABINFO
- bool
- depends on PROC_FS
- depends on SLAB || SLUB_DEBUG
- default y
+source "arch/Kconfig"
config RT_MUTEXES
- boolean
-
-config BASE_SMALL
- int
- default 0 if BASE_FULL
- default 1 if !BASE_FULL
-
-menuconfig MODULES
- bool "Enable loadable module support"
- help
- Kernel modules are small pieces of compiled code which can
- be inserted in the running kernel, rather than being
- permanently built into the kernel. You use the "modprobe"
- tool to add (and sometimes remove) them. If you say Y here,
- many parts of the kernel can be built as modules (by
- answering M instead of Y where indicated): this is most
- useful for infrequently used options which are not required
- for booting. For more information, see the man pages for
- modprobe, lsmod, modinfo, insmod and rmmod.
-
- If you say Y here, you will need to run "make
- modules_install" to put the modules under /lib/modules/
- where modprobe can find them (you may need to be root to do
- this).
-
- If unsure, say Y.
-
-if MODULES
-
-config MODULE_FORCE_LOAD
- bool "Forced module loading"
- default n
- help
- Allow loading of modules without version information (ie. modprobe
- --force). Forced module loading sets the 'F' (forced) taint flag and
- is usually a really bad idea.
-
-config MODULE_UNLOAD
- bool "Module unloading"
- help
- Without this option you will not be able to unload any
- modules (note that some modules may not be unloadable
- anyway), which makes your kernel smaller, faster
- and simpler. If unsure, say Y.
-
-config MODULE_FORCE_UNLOAD
- bool "Forced module unloading"
- depends on MODULE_UNLOAD
- help
- This option allows you to force a module to unload, even if the
- kernel believes it is unsafe: the kernel will remove the module
- without waiting for anyone to stop using it (using the -f option to
- rmmod). This is mainly for kernel developers and desperate users.
- If unsure, say N.
-
-config MODVERSIONS
- bool "Module versioning support"
- help
- Usually, you have to use modules compiled with your kernel.
- Saying Y here makes it sometimes possible to use modules
- compiled for different kernels, by adding enough information
- to the modules to (hopefully) spot any changes which would
- make them incompatible with the kernel you are running. If
- unsure, say N.
-
-config MODULE_SRCVERSION_ALL
- bool "Source checksum for all modules"
- help
- Modules which contain a MODULE_VERSION get an extra "srcversion"
- field inserted into their modinfo section, which contains a
- sum of the source files which made it. This helps maintainers
- see exactly which source was used to build a module (since
- others sometimes change the module source without updating
- the version). With this option, such a "srcversion" field
- will be created for all modules. If unsure, say N.
-
-config MODULE_SIG
- bool "Module signature verification"
- depends on MODULES
- select KEYS
- select CRYPTO
- select ASYMMETRIC_KEY_TYPE
- select ASYMMETRIC_PUBLIC_KEY_SUBTYPE
- select PUBLIC_KEY_ALGO_RSA
- select ASN1
- select OID_REGISTRY
- select X509_CERTIFICATE_PARSER
- help
- Check modules for valid signatures upon load: the signature
- is simply appended to the module. For more information see
- Documentation/module-signing.txt.
-
- !!!WARNING!!! If you enable this option, you MUST make sure that the
- module DOES NOT get stripped after being signed. This includes the
- debuginfo strip done by some packagers (such as rpmbuild) and
- inclusion into an initramfs that wants the module size reduced.
-
-config MODULE_SIG_FORCE
- bool "Require modules to be validly signed"
- depends on MODULE_SIG
- help
- Reject unsigned modules or signed modules for which we don't have a
- key. Without this, such modules will simply taint the kernel.
-
-config MODULE_SIG_ALL
- bool "Automatically sign all modules"
- default y
- depends on MODULE_SIG
- help
- Sign all modules during make modules_install. Without this option,
- modules must be signed manually, using the scripts/sign-file tool.
-
-comment "Do not forget to sign required modules with scripts/sign-file"
- depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
-
-choice
- prompt "Which hash algorithm should modules be signed with?"
- depends on MODULE_SIG
- help
- This determines which sort of hashing algorithm will be used during
- signature generation. This algorithm _must_ be built into the kernel
- directly so that signature verification can take place. It is not
- possible to load a signed module containing the algorithm to check
- the signature on that module.
-
-config MODULE_SIG_SHA1
- bool "Sign modules with SHA-1"
- select CRYPTO_SHA1
-
-config MODULE_SIG_SHA224
- bool "Sign modules with SHA-224"
- select CRYPTO_SHA256
-
-config MODULE_SIG_SHA256
- bool "Sign modules with SHA-256"
- select CRYPTO_SHA256
-
-config MODULE_SIG_SHA384
- bool "Sign modules with SHA-384"
- select CRYPTO_SHA512
-
-config MODULE_SIG_SHA512
- bool "Sign modules with SHA-512"
- select CRYPTO_SHA512
-
-endchoice
+ bool
+ default y if PREEMPT_RT
-config MODULE_SIG_HASH
- string
- depends on MODULE_SIG
- default "sha1" if MODULE_SIG_SHA1
- default "sha224" if MODULE_SIG_SHA224
- default "sha256" if MODULE_SIG_SHA256
- default "sha384" if MODULE_SIG_SHA384
- default "sha512" if MODULE_SIG_SHA512
+config MODULE_SIG_FORMAT
+ def_bool n
+ select SYSTEM_DATA_VERIFICATION
-endif # MODULES
+source "kernel/module/Kconfig"
config INIT_ALL_POSSIBLE
bool
@@ -1824,13 +2214,6 @@ config INIT_ALL_POSSIBLE
it was better to provide this option than to break all the archs
and have several arch maintainers pursuing me down dark alleys.
-config STOP_MACHINE
- bool
- default y
- depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
- help
- Need stop_machine() primitive.
-
source "block/Kconfig"
config PREEMPT_NOTIFIERS
@@ -1840,12 +2223,6 @@ config PADATA
depends on SMP
bool
-# Can be selected by architectures with broken toolchains
-# that get confused by correct const<->read_only section
-# mappings
-config BROKEN_RODATA
- bool
-
config ASN1
tristate
help
@@ -1855,3 +2232,22 @@ config ASN1
functions to call on what tags.
source "kernel/Kconfig.locks"
+
+config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ bool
+
+config ARCH_HAS_PREPARE_SYNC_CORE_CMD
+ bool
+
+config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+ bool
+
+# It may be useful for an architecture to override the definitions of the
+# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>
+# and the COMPAT_ variants in <linux/compat.h>, in particular to use a
+# different calling convention for syscalls. They can also override the
+# macros for not-implemented syscalls in kernel/sys_ni.c and
+# kernel/time/posix-stubs.c. All these overrides need to be available in
+# <asm/syscall_wrapper.h>.
+config ARCH_HAS_SYSCALL_WRAPPER
+ def_bool n
diff --git a/init/Makefile b/init/Makefile
index 7bc47ee31c36..d6f75d8907e0 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -1,7 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
+ccflags-y := -fno-function-sections -fno-data-sections
+
obj-y := main.o version.o mounts.o
ifneq ($(CONFIG_BLK_DEV_INITRD),y)
obj-y += noinitramfs.o
@@ -9,28 +12,51 @@ else
obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o
endif
obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
+obj-$(CONFIG_INITRAMFS_TEST) += initramfs_test.o
-ifneq ($(CONFIG_ARCH_INIT_TASK),y)
obj-y += init_task.o
-endif
mounts-y := do_mounts.o
mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o
mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
-mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
-
-# dependencies on generated files need to be listed explicitly
-$(obj)/version.o: include/generated/compile.h
-
-# compile.h changes depending on hostname, generation number, etc,
-# so we regenerate it always.
-# mkcompile_h will make sure to only update the
-# actual file if its content has changed.
-
- chk_compile.h = :
- quiet_chk_compile.h = echo ' CHK $@'
-silent_chk_compile.h = :
-include/generated/compile.h: FORCE
- @$($(quiet)chk_compile.h)
- $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
- "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
+
+#
+# UTS_VERSION
+#
+
+smp-flag-$(CONFIG_SMP) := SMP
+preempt-flag-$(CONFIG_PREEMPT_BUILD) := PREEMPT
+preempt-flag-$(CONFIG_PREEMPT_DYNAMIC) := PREEMPT_DYNAMIC
+preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT
+
+build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
+build-timestamp = $(or $(KBUILD_BUILD_TIMESTAMP), $(build-timestamp-auto))
+
+# Maximum length of UTS_VERSION is 64 chars
+filechk_uts_version = \
+ utsver=$$(echo '$(pound)'"$(build-version)" $(smp-flag-y) $(preempt-flag-y) "$(build-timestamp)" | cut -b -64); \
+ echo '$(pound)'define UTS_VERSION \""$${utsver}"\"
+
+#
+# Build version.c with temporary UTS_VERSION
+#
+
+$(obj)/utsversion-tmp.h: FORCE
+ $(call filechk,uts_version)
+
+clean-files += utsversion-tmp.h
+
+$(obj)/version.o: $(obj)/utsversion-tmp.h
+CFLAGS_version.o := -include $(obj)/utsversion-tmp.h
+
+#
+# Build version-timestamp.c with final UTS_VERSION
+#
+
+include/generated/utsversion.h: build-version-auto = $(shell $(srctree)/scripts/build-version)
+include/generated/utsversion.h: build-timestamp-auto = $(shell LC_ALL=C date)
+include/generated/utsversion.h: FORCE
+ $(call filechk,uts_version)
+
+$(obj)/version-timestamp.o: include/generated/utsversion.h
+CFLAGS_version-timestamp.o := -include include/generated/utsversion.h
diff --git a/init/calibrate.c b/init/calibrate.c
index fda0a7b0f06c..63be4c65bc52 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -1,22 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
/* calibrate.c: default delay calibration
*
* Excised from init/main.c
* Copyright (C) 1991, 1992 Linus Torvalds
*/
-#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/smp.h>
+#include <linux/jiffies.h>
+#include <linux/kstrtox.h>
#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/timex.h>
unsigned long lpj_fine;
unsigned long preset_lpj;
+
static int __init lpj_setup(char *str)
{
- preset_lpj = simple_strtoul(str,NULL,0);
- return 1;
+ return kstrtoul(str, 0, &preset_lpj) == 0;
}
__setup("lpj=", lpj_setup);
@@ -31,7 +35,7 @@ __setup("lpj=", lpj_setup);
#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100))
#define MAX_DIRECT_CALIBRATION_RETRIES 5
-static unsigned long __cpuinit calibrate_delay_direct(void)
+static unsigned long calibrate_delay_direct(void)
{
unsigned long pre_start, start, post_start;
unsigned long pre_end, end, post_end;
@@ -166,7 +170,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
return 0;
}
#else
-static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
+static unsigned long calibrate_delay_direct(void)
+{
+ return 0;
+}
#endif
/*
@@ -180,7 +187,7 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
*/
#define LPS_PREC 8
-static unsigned long __cpuinit calibrate_delay_converge(void)
+static unsigned long calibrate_delay_converge(void)
{
/* First stage - slowly accelerate to find initial bounds */
unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;
@@ -254,12 +261,21 @@ static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 };
* Architectures should override this function if a faster calibration
* method is available.
*/
-unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void)
+unsigned long __attribute__((weak)) calibrate_delay_is_known(void)
{
return 0;
}
-void __cpuinit calibrate_delay(void)
+/*
+ * Indicate the cpu delay calibration is done. This can be used by
+ * architectures to stop accepting delay timer registrations after this point.
+ */
+
+void __attribute__((weak)) calibration_delay_done(void)
+{
+}
+
+void calibrate_delay(void)
{
unsigned long lpj;
static bool printed;
@@ -298,4 +314,6 @@ void __cpuinit calibrate_delay(void)
loops_per_jiffy = lpj;
printed = true;
+
+ calibration_delay_done();
}
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 816014c4627e..defbbf1d55f7 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -1,13 +1,4 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/ctype.h>
@@ -17,7 +8,6 @@
#include <linux/root_dev.h>
#include <linux/security.h>
#include <linux/delay.h>
-#include <linux/genhd.h>
#include <linux/mount.h>
#include <linux/device.h>
#include <linux/init.h>
@@ -26,17 +16,19 @@
#include <linux/async.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#include <linux/ramfs.h>
+#include <linux/shmem_fs.h>
+#include <linux/ktime.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_fs_sb.h>
#include <linux/nfs_mount.h>
+#include <linux/raid/detect.h>
+#include <uapi/linux/mount.h>
#include "do_mounts.h"
-int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */
-
int root_mountflags = MS_RDONLY | MS_SILENT;
-static char * __initdata root_device_name;
static char __initdata saved_root_name[64];
static int root_wait;
@@ -44,7 +36,7 @@ dev_t ROOT_DEV;
static int __init load_ramdisk(char *str)
{
- rd_doload = simple_strtol(str,NULL,0) & 3;
+ pr_warn("ignoring the deprecated load_ramdisk= option\n");
return 1;
}
__setup("load_ramdisk=", load_ramdisk);
@@ -68,237 +60,48 @@ static int __init readwrite(char *str)
__setup("ro", readonly);
__setup("rw", readwrite);
-#ifdef CONFIG_BLOCK
-struct uuidcmp {
- const char *uuid;
- int len;
-};
-
-/**
- * match_dev_by_uuid - callback for finding a partition using its uuid
- * @dev: device passed in by the caller
- * @data: opaque pointer to the desired struct uuidcmp to match
- *
- * Returns 1 if the device matches, and 0 otherwise.
- */
-static int match_dev_by_uuid(struct device *dev, const void *data)
+static int __init root_dev_setup(char *line)
{
- const struct uuidcmp *cmp = data;
- struct hd_struct *part = dev_to_part(dev);
-
- if (!part->info)
- goto no_match;
-
- if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len))
- goto no_match;
-
+ strscpy(saved_root_name, line, sizeof(saved_root_name));
return 1;
-no_match:
- return 0;
}
+__setup("root=", root_dev_setup);
-/**
- * devt_from_partuuid - looks up the dev_t of a partition by its UUID
- * @uuid: char array containing ascii UUID
- *
- * The function will return the first partition which contains a matching
- * UUID value in its partition_meta_info struct. This does not search
- * by filesystem UUIDs.
- *
- * If @uuid is followed by a "/PARTNROFF=%d", then the number will be
- * extracted and used as an offset from the partition identified by the UUID.
- *
- * Returns the matching dev_t on success or 0 on failure.
- */
-static dev_t devt_from_partuuid(const char *uuid_str)
+static int __init rootwait_setup(char *str)
{
- dev_t res = 0;
- struct uuidcmp cmp;
- struct device *dev = NULL;
- struct gendisk *disk;
- struct hd_struct *part;
- int offset = 0;
- bool clear_root_wait = false;
- char *slash;
-
- cmp.uuid = uuid_str;
-
- slash = strchr(uuid_str, '/');
- /* Check for optional partition number offset attributes. */
- if (slash) {
- char c = 0;
- /* Explicitly fail on poor PARTUUID syntax. */
- if (sscanf(slash + 1,
- "PARTNROFF=%d%c", &offset, &c) != 1) {
- clear_root_wait = true;
- goto done;
- }
- cmp.len = slash - uuid_str;
- } else {
- cmp.len = strlen(uuid_str);
- }
-
- if (!cmp.len) {
- clear_root_wait = true;
- goto done;
- }
-
- dev = class_find_device(&block_class, NULL, &cmp,
- &match_dev_by_uuid);
- if (!dev)
- goto done;
-
- res = dev->devt;
-
- /* Attempt to find the partition by offset. */
- if (!offset)
- goto no_offset;
-
- res = 0;
- disk = part_to_disk(dev_to_part(dev));
- part = disk_get_part(disk, dev_to_part(dev)->partno + offset);
- if (part) {
- res = part_devt(part);
- put_device(part_to_dev(part));
- }
-
-no_offset:
- put_device(dev);
-done:
- if (clear_root_wait) {
- pr_err("VFS: PARTUUID= is invalid.\n"
- "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
- if (root_wait)
- pr_err("Disabling rootwait; root= is invalid.\n");
- root_wait = 0;
- }
- return res;
+ if (*str)
+ return 0;
+ root_wait = -1;
+ return 1;
}
-#endif
-/*
- * Convert a name into device number. We accept the following variants:
- *
- * 1) device number in hexadecimal represents itself
- * 2) /dev/nfs represents Root_NFS (0xff)
- * 3) /dev/<disk_name> represents the device number of disk
- * 4) /dev/<disk_name><decimal> represents the device number
- * of partition - device number of disk plus the partition number
- * 5) /dev/<disk_name>p<decimal> - same as the above, that form is
- * used when disk name of partitioned disk ends on a digit.
- * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
- * unique id of a partition if the partition table provides it.
- * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
- * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
- * filled hex representation of the 32-bit "NT disk signature", and PP
- * is a zero-filled hex representation of the 1-based partition number.
- * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
- * a partition with a known unique id.
- *
- * If name doesn't have fall into the categories above, we return (0,0).
- * block_class is used to check if something is a disk name. If the disk
- * name contains slashes, the device name has them replaced with
- * bangs.
- */
+__setup("rootwait", rootwait_setup);
-dev_t name_to_dev_t(char *name)
+static int __init rootwait_timeout_setup(char *str)
{
- char s[32];
- char *p;
- dev_t res = 0;
- int part;
+ int sec;
-#ifdef CONFIG_BLOCK
- if (strncmp(name, "PARTUUID=", 9) == 0) {
- name += 9;
- res = devt_from_partuuid(name);
- if (!res)
- goto fail;
- goto done;
+ if (kstrtoint(str, 0, &sec) || sec < 0) {
+ pr_warn("ignoring invalid rootwait value\n");
+ goto ignore;
}
-#endif
- if (strncmp(name, "/dev/", 5) != 0) {
- unsigned maj, min;
-
- if (sscanf(name, "%u:%u", &maj, &min) == 2) {
- res = MKDEV(maj, min);
- if (maj != MAJOR(res) || min != MINOR(res))
- goto fail;
- } else {
- res = new_decode_dev(simple_strtoul(name, &p, 16));
- if (*p)
- goto fail;
- }
- goto done;
+ if (check_mul_overflow(sec, MSEC_PER_SEC, &root_wait)) {
+ pr_warn("ignoring excessive rootwait value\n");
+ goto ignore;
}
- name += 5;
- res = Root_NFS;
- if (strcmp(name, "nfs") == 0)
- goto done;
- res = Root_RAM0;
- if (strcmp(name, "ram") == 0)
- goto done;
-
- if (strlen(name) > 31)
- goto fail;
- strcpy(s, name);
- for (p = s; *p; p++)
- if (*p == '/')
- *p = '!';
- res = blk_lookup_devt(s, 0);
- if (res)
- goto done;
-
- /*
- * try non-existent, but valid partition, which may only exist
- * after revalidating the disk, like partitioned md devices
- */
- while (p > s && isdigit(p[-1]))
- p--;
- if (p == s || !*p || *p == '0')
- goto fail;
-
- /* try disk name without <part number> */
- part = simple_strtoul(p, NULL, 10);
- *p = '\0';
- res = blk_lookup_devt(s, part);
- if (res)
- goto done;
-
- /* try disk name without p<part number> */
- if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p')
- goto fail;
- p[-1] = '\0';
- res = blk_lookup_devt(s, part);
- if (res)
- goto done;
-
-fail:
- return 0;
-done:
- return res;
-}
-
-static int __init root_dev_setup(char *line)
-{
- strlcpy(saved_root_name, line, sizeof(saved_root_name));
return 1;
-}
-__setup("root=", root_dev_setup);
+ignore:
+ /* Fallback to indefinite wait */
+ root_wait = -1;
-static int __init rootwait_setup(char *str)
-{
- if (*str)
- return 0;
- root_wait = 1;
return 1;
}
-__setup("rootwait", rootwait_setup);
+__setup("rootwait=", rootwait_timeout_setup);
static char * __initdata root_mount_data;
static int __init root_data_setup(char *str)
@@ -317,7 +120,8 @@ static int __init fs_names_setup(char *str)
static unsigned int __initdata root_delay;
static int __init root_delay_setup(char *str)
{
- root_delay = simple_strtoul(str, NULL, 0);
+ if (kstrtouint(str, 0, &root_delay))
+ return 0;
return 1;
}
@@ -325,74 +129,88 @@ __setup("rootflags=", root_data_setup);
__setup("rootfstype=", fs_names_setup);
__setup("rootdelay=", root_delay_setup);
-static void __init get_fs_names(char *page)
+/* This can return zero length strings. Caller should check */
+static int __init split_fs_names(char *page, size_t size)
{
- char *s = page;
-
- if (root_fs_names) {
- strcpy(page, root_fs_names);
- while (*s++) {
- if (s[-1] == ',')
- s[-1] = '\0';
- }
- } else {
- int len = get_filesystem_list(page);
- char *p, *next;
-
- page[len] = '\0';
- for (p = page-1; p; p = next) {
- next = strchr(++p, '\n');
- if (*p++ != '\t')
- continue;
- while ((*s++ = *p++) != '\n')
- ;
- s[-1] = '\0';
+ int count = 1;
+ char *p = page;
+
+ strscpy(p, root_fs_names, size);
+ while (*p++) {
+ if (p[-1] == ',') {
+ p[-1] = '\0';
+ count++;
}
}
- *s = '\0';
+
+ return count;
}
-static int __init do_mount_root(char *name, char *fs, int flags, void *data)
+static int __init do_mount_root(const char *name, const char *fs,
+ const int flags, const void *data)
{
struct super_block *s;
- int err = sys_mount(name, "/root", fs, flags, data);
- if (err)
- return err;
+ struct page *p = NULL;
+ char *data_page = NULL;
+ int ret;
+
+ if (data) {
+ /* init_mount() requires a full page as fifth argument */
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ data_page = page_address(p);
+ strscpy_pad(data_page, data, PAGE_SIZE);
+ }
- sys_chdir("/root");
+ ret = init_mount(name, "/root", fs, flags, data_page);
+ if (ret)
+ goto out;
+
+ init_chdir("/root");
s = current->fs->pwd.dentry->d_sb;
ROOT_DEV = s->s_dev;
printk(KERN_INFO
"VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
s->s_type->name,
- s->s_flags & MS_RDONLY ? " readonly" : "",
+ sb_rdonly(s) ? " readonly" : "",
MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
- return 0;
+
+out:
+ if (p)
+ put_page(p);
+ return ret;
}
-void __init mount_block_root(char *name, int flags)
+void __init mount_root_generic(char *name, char *pretty_name, int flags)
{
- struct page *page = alloc_page(GFP_KERNEL |
- __GFP_NOTRACK_FALSE_POSITIVE);
+ struct page *page = alloc_page(GFP_KERNEL);
char *fs_names = page_address(page);
char *p;
-#ifdef CONFIG_BLOCK
char b[BDEVNAME_SIZE];
-#else
- const char *b = name;
-#endif
-
- get_fs_names(fs_names);
+ int num_fs, i;
+
+ scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)",
+ MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
+ if (root_fs_names)
+ num_fs = split_fs_names(fs_names, PAGE_SIZE);
+ else
+ num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
retry:
- for (p = fs_names; *p; p += strlen(p)+1) {
- int err = do_mount_root(name, p, flags, root_mount_data);
+ for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) {
+ int err;
+
+ if (!*p)
+ continue;
+ err = do_mount_root(name, p, flags, root_mount_data);
switch (err) {
case 0:
goto out;
case -EACCES:
- flags |= MS_RDONLY;
- goto retry;
case -EINVAL:
+#ifdef CONFIG_BLOCK
+ init_flush_fput();
+#endif
continue;
}
/*
@@ -400,31 +218,36 @@ retry:
* and bad superblock on root device.
* and give them a list of the available devices
*/
-#ifdef CONFIG_BLOCK
- __bdevname(ROOT_DEV, b);
-#endif
printk("VFS: Cannot open root device \"%s\" or %s: error %d\n",
- root_device_name, b, err);
+ pretty_name, b, err);
printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
-
printk_all_partitions();
-#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
- printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
- "explicit textual name for \"root=\" boot option.\n");
-#endif
+
+ if (root_fs_names)
+ num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
+ if (!num_fs)
+ pr_err("Can't find any bdev filesystem to be used for mount!\n");
+ else {
+ pr_err("List of all bdev filesystems:\n");
+ for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
+ pr_err(" %s", p);
+ pr_err("\n");
+ }
+
panic("VFS: Unable to mount root fs on %s", b);
}
+ if (!(flags & SB_RDONLY)) {
+ flags |= SB_RDONLY;
+ goto retry;
+ }
printk("List of all partitions:\n");
printk_all_partitions();
printk("No filesystem could mount root, tried: ");
- for (p = fs_names; *p; p += strlen(p)+1)
+ for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
printk(" %s", p);
printk("\n");
-#ifdef CONFIG_BLOCK
- __bdevname(ROOT_DEV, b);
-#endif
- panic("VFS: Unable to mount root fs on %s", b);
+ panic("VFS: Unable to mount root fs on \"%s\" or %s", pretty_name, b);
out:
put_page(page);
}
@@ -435,15 +258,14 @@ out:
#define NFSROOT_TIMEOUT_MAX 30
#define NFSROOT_RETRY_MAX 5
-static int __init mount_nfs_root(void)
+static void __init mount_nfs_root(void)
{
char *root_dev, *root_data;
unsigned int timeout;
- int try, err;
+ int try;
- err = nfs_root_data(&root_dev, &root_data);
- if (err != 0)
- return 0;
+ if (nfs_root_data(&root_dev, &root_data))
+ goto fail;
/*
* The server or network may not be ready, so try several
@@ -452,10 +274,8 @@ static int __init mount_nfs_root(void)
*/
timeout = NFSROOT_TIMEOUT_MIN;
for (try = 1; ; try++) {
- err = do_mount_root(root_dev, "nfs",
- root_mountflags, root_data);
- if (err == 0)
- return 1;
+ if (!do_mount_root(root_dev, "nfs", root_mountflags, root_data))
+ return;
if (try > NFSROOT_RETRY_MAX)
break;
@@ -465,67 +285,178 @@ static int __init mount_nfs_root(void)
if (timeout > NFSROOT_TIMEOUT_MAX)
timeout = NFSROOT_TIMEOUT_MAX;
}
- return 0;
+fail:
+ pr_err("VFS: Unable to mount root fs via NFS.\n");
}
-#endif
+#else
+static inline void mount_nfs_root(void)
+{
+}
+#endif /* CONFIG_ROOT_NFS */
-#if defined(CONFIG_BLK_DEV_RAM) || defined(CONFIG_BLK_DEV_FD)
-void __init change_floppy(char *fmt, ...)
+#ifdef CONFIG_CIFS_ROOT
+
+#define CIFSROOT_TIMEOUT_MIN 5
+#define CIFSROOT_TIMEOUT_MAX 30
+#define CIFSROOT_RETRY_MAX 5
+
+static void __init mount_cifs_root(void)
{
- struct termios termios;
- char buf[80];
- char c;
- int fd;
- va_list args;
- va_start(args, fmt);
- vsprintf(buf, fmt, args);
- va_end(args);
- fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0);
- if (fd >= 0) {
- sys_ioctl(fd, FDEJECT, 0);
- sys_close(fd);
- }
- printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf);
- fd = sys_open("/dev/console", O_RDWR, 0);
- if (fd >= 0) {
- sys_ioctl(fd, TCGETS, (long)&termios);
- termios.c_lflag &= ~ICANON;
- sys_ioctl(fd, TCSETSF, (long)&termios);
- sys_read(fd, &c, 1);
- termios.c_lflag |= ICANON;
- sys_ioctl(fd, TCSETSF, (long)&termios);
- sys_close(fd);
+ char *root_dev, *root_data;
+ unsigned int timeout;
+ int try;
+
+ if (cifs_root_data(&root_dev, &root_data))
+ goto fail;
+
+ timeout = CIFSROOT_TIMEOUT_MIN;
+ for (try = 1; ; try++) {
+ if (!do_mount_root(root_dev, "cifs", root_mountflags,
+ root_data))
+ return;
+ if (try > CIFSROOT_RETRY_MAX)
+ break;
+
+ ssleep(timeout);
+ timeout <<= 1;
+ if (timeout > CIFSROOT_TIMEOUT_MAX)
+ timeout = CIFSROOT_TIMEOUT_MAX;
}
+fail:
+ pr_err("VFS: Unable to mount root fs via SMB.\n");
}
-#endif
+#else
+static inline void mount_cifs_root(void)
+{
+}
+#endif /* CONFIG_CIFS_ROOT */
-void __init mount_root(void)
+static bool __init fs_is_nodev(char *fstype)
{
-#ifdef CONFIG_ROOT_NFS
- if (ROOT_DEV == Root_NFS) {
- if (mount_nfs_root())
- return;
+ struct file_system_type *fs = get_fs_type(fstype);
+ bool ret = false;
- printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
- ROOT_DEV = Root_FD0;
+ if (fs) {
+ ret = !(fs->fs_flags & FS_REQUIRES_DEV);
+ put_filesystem(fs);
}
-#endif
-#ifdef CONFIG_BLK_DEV_FD
- if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
- /* rd_doload is 2 for a dual initrd/ramload setup */
- if (rd_doload==2) {
- if (rd_load_disk(1)) {
- ROOT_DEV = Root_RAM1;
- root_device_name = NULL;
- }
- } else
- change_floppy("root floppy");
+
+ return ret;
+}
+
+static int __init mount_nodev_root(char *root_device_name)
+{
+ char *fs_names, *fstype;
+ int err = -EINVAL;
+ int num_fs, i;
+
+ fs_names = (void *)__get_free_page(GFP_KERNEL);
+ if (!fs_names)
+ return -EINVAL;
+ num_fs = split_fs_names(fs_names, PAGE_SIZE);
+
+ for (i = 0, fstype = fs_names; i < num_fs;
+ i++, fstype += strlen(fstype) + 1) {
+ if (!*fstype)
+ continue;
+ if (!fs_is_nodev(fstype))
+ continue;
+ err = do_mount_root(root_device_name, fstype, root_mountflags,
+ root_mount_data);
+ if (!err)
+ break;
}
-#endif
+
+ free_page((unsigned long)fs_names);
+ return err;
+}
+
#ifdef CONFIG_BLOCK
- create_dev("/dev/root", ROOT_DEV);
- mount_block_root("/dev/root", root_mountflags);
-#endif
+static void __init mount_block_root(char *root_device_name)
+{
+ int err = create_dev("/dev/root", ROOT_DEV);
+
+ if (err < 0)
+ pr_emerg("Failed to create /dev/root: %d\n", err);
+ mount_root_generic("/dev/root", root_device_name, root_mountflags);
+}
+#else
+static inline void mount_block_root(char *root_device_name)
+{
+}
+#endif /* CONFIG_BLOCK */
+
+void __init mount_root(char *root_device_name)
+{
+ switch (ROOT_DEV) {
+ case Root_NFS:
+ mount_nfs_root();
+ break;
+ case Root_CIFS:
+ mount_cifs_root();
+ break;
+ case Root_Generic:
+ mount_root_generic(root_device_name, root_device_name,
+ root_mountflags);
+ break;
+ case 0:
+ if (root_device_name && root_fs_names &&
+ mount_nodev_root(root_device_name) == 0)
+ break;
+ fallthrough;
+ default:
+ mount_block_root(root_device_name);
+ break;
+ }
+}
+
+/* wait for any asynchronous scanning to complete */
+static void __init wait_for_root(char *root_device_name)
+{
+ ktime_t end;
+
+ if (ROOT_DEV != 0)
+ return;
+
+ pr_info("Waiting for root device %s...\n", root_device_name);
+
+ end = ktime_add_ms(ktime_get_raw(), root_wait);
+
+ while (!driver_probe_done() ||
+ early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) {
+ msleep(5);
+ if (root_wait > 0 && ktime_after(ktime_get_raw(), end))
+ break;
+ }
+
+ async_synchronize_full();
+
+}
+
+static dev_t __init parse_root_device(char *root_device_name)
+{
+ int error;
+ dev_t dev;
+
+ if (!strncmp(root_device_name, "mtd", 3) ||
+ !strncmp(root_device_name, "ubi", 3))
+ return Root_Generic;
+ if (strcmp(root_device_name, "/dev/nfs") == 0)
+ return Root_NFS;
+ if (strcmp(root_device_name, "/dev/cifs") == 0)
+ return Root_CIFS;
+ if (strcmp(root_device_name, "/dev/ram") == 0)
+ return Root_RAM0;
+
+ error = early_lookup_bdev(root_device_name, &dev);
+ if (error) {
+ if (error == -EINVAL && root_wait) {
+ pr_err("Disabling rootwait; root= is invalid.\n");
+ root_wait = 0;
+ }
+ return 0;
+ }
+ return dev;
}
/*
@@ -533,8 +464,6 @@ void __init mount_root(void)
*/
void __init prepare_namespace(void)
{
- int is_floppy;
-
if (root_delay) {
printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
root_delay);
@@ -552,39 +481,42 @@ void __init prepare_namespace(void)
md_run_setup();
- if (saved_root_name[0]) {
- root_device_name = saved_root_name;
- if (!strncmp(root_device_name, "mtd", 3) ||
- !strncmp(root_device_name, "ubi", 3)) {
- mount_block_root(root_device_name, root_mountflags);
- goto out;
- }
- ROOT_DEV = name_to_dev_t(root_device_name);
- if (strncmp(root_device_name, "/dev/", 5) == 0)
- root_device_name += 5;
- }
+ if (saved_root_name[0])
+ ROOT_DEV = parse_root_device(saved_root_name);
- if (initrd_load())
+ if (initrd_load(saved_root_name))
goto out;
- /* wait for any asynchronous scanning to complete */
- if ((ROOT_DEV == 0) && root_wait) {
- printk(KERN_INFO "Waiting for root device %s...\n",
- saved_root_name);
- while (driver_probe_done() != 0 ||
- (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
- msleep(100);
- async_synchronize_full();
- }
+ if (root_wait)
+ wait_for_root(saved_root_name);
+ mount_root(saved_root_name);
+out:
+ devtmpfs_mount();
+ init_mount(".", "/", NULL, MS_MOVE, NULL);
+ init_chroot(".");
+}
+
+static bool is_tmpfs;
+static int rootfs_init_fs_context(struct fs_context *fc)
+{
+ if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
+ return shmem_init_fs_context(fc);
- is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;
+ return ramfs_init_fs_context(fc);
+}
- if (is_floppy && rd_doload && rd_load_disk(0))
- ROOT_DEV = Root_RAM0;
+struct file_system_type rootfs_fs_type = {
+ .name = "rootfs",
+ .init_fs_context = rootfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
- mount_root();
-out:
- devtmpfs_mount("dev");
- sys_mount(".", "/", NULL, MS_MOVE, NULL);
- sys_chroot(".");
+void __init init_rootfs(void)
+{
+ if (IS_ENABLED(CONFIG_TMPFS)) {
+ if (!saved_root_name[0] && !root_fs_names)
+ is_tmpfs = true;
+ else if (root_fs_names && !!strstr(root_fs_names, "tmpfs"))
+ is_tmpfs = true;
+ }
}
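
The do_mounts.c changes above replace get_fs_names() with split_fs_names(), which rewrites the comma-separated rootfstype= list in place into NUL-separated fields and returns a count, so callers iterate with strlen() + 1 and skip empty entries. A minimal userspace sketch of that layout, for illustration only (the demo function name and sample input are hypothetical, and strncpy stands in for the kernel's strscpy):

#include <stdio.h>
#include <string.h>

/* Illustration only: split "a,b,c" in place into NUL-separated fields. */
static int split_fs_names_demo(char *buf, size_t size, const char *names)
{
	int count = 1;
	char *p = buf;

	strncpy(buf, names, size - 1);
	buf[size - 1] = '\0';
	while (*p++) {
		if (p[-1] == ',') {
			p[-1] = '\0';
			count++;
		}
	}
	return count;
}

int main(void)
{
	char page[64];
	char *p = page;
	int i, num_fs = split_fs_names_demo(page, sizeof(page), "ext4,xfs,btrfs");

	/* Same walk the mount loops use: step by strlen() + 1, skip empties. */
	for (i = 0; i < num_fs; i++, p += strlen(p) + 1) {
		if (!*p)
			continue;
		printf("fs[%d] = %s\n", i, p);
	}
	return 0;
}
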
diff --git a/init/do_mounts.h b/init/do_mounts.h
index f5b978a9bb92..6069ea3eb80d 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/init.h>
@@ -7,42 +8,20 @@
#include <linux/mount.h>
#include <linux/major.h>
#include <linux/root_dev.h>
+#include <linux/init_syscalls.h>
+#include <linux/task_work.h>
+#include <linux/file.h>
-void change_floppy(char *fmt, ...);
-void mount_block_root(char *name, int flags);
-void mount_root(void);
+void mount_root_generic(char *name, char *pretty_name, int flags);
+void mount_root(char *root_device_name);
extern int root_mountflags;
-static inline int create_dev(char *name, dev_t dev)
+static inline __init int create_dev(char *name, dev_t dev)
{
- sys_unlink(name);
- return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
+ init_unlink(name);
+ return init_mknod(name, S_IFBLK | 0600, new_encode_dev(dev));
}
-#if BITS_PER_LONG == 32
-static inline u32 bstat(char *name)
-{
- struct stat64 stat;
- if (sys_stat64(name, &stat) != 0)
- return 0;
- if (!S_ISBLK(stat.st_mode))
- return 0;
- if (stat.st_rdev != (u32)stat.st_rdev)
- return 0;
- return stat.st_rdev;
-}
-#else
-static inline u32 bstat(char *name)
-{
- struct stat stat;
- if (sys_newstat(name, &stat) != 0)
- return 0;
- if (!S_ISBLK(stat.st_mode))
- return 0;
- return stat.st_rdev;
-}
-#endif
-
#ifdef CONFIG_BLK_DEV_RAM
int __init rd_load_disk(int n);
@@ -56,21 +35,18 @@ static inline int rd_load_image(char *from) { return 0; }
#endif
#ifdef CONFIG_BLK_DEV_INITRD
-
-int __init initrd_load(void);
-
+bool __init initrd_load(char *root_device_name);
#else
-
-static inline int initrd_load(void) { return 0; }
+static inline bool initrd_load(char *root_device_name)
+{
+ return false;
+}
#endif
-#ifdef CONFIG_BLK_DEV_MD
-
-void md_run_setup(void);
-
-#else
-
-static inline void md_run_setup(void) {}
-
-#endif
+/* Ensure that async file closing finished to prevent spurious errors. */
+static inline void init_flush_fput(void)
+{
+ flush_delayed_fput();
+ task_work_run();
+}
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 3e0878e8a80d..f6867bad0d78 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -1,13 +1,4 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
+// SPDX-License-Identifier: GPL-2.0
#include <linux/unistd.h>
#include <linux/kernel.h>
#include <linux/fs.h>
@@ -17,14 +8,37 @@
#include <linux/sched.h>
#include <linux/freezer.h>
#include <linux/kmod.h>
+#include <uapi/linux/mount.h>
#include "do_mounts.h"
unsigned long initrd_start, initrd_end;
int initrd_below_start_ok;
-unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */
+static unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */
static int __initdata mount_initrd = 1;
+phys_addr_t phys_initrd_start __initdata;
+unsigned long phys_initrd_size __initdata;
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_do_mounts_initrd_table[] = {
+ {
+ .procname = "real-root-dev",
+ .data = &real_root_dev,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static __init int kernel_do_mounts_initrd_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_do_mounts_initrd_table);
+ return 0;
+}
+late_initcall(kernel_do_mounts_initrd_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
static int __init no_initrd(char *str)
{
mount_initrd = 0;
@@ -33,90 +47,93 @@ static int __init no_initrd(char *str)
__setup("noinitrd", no_initrd);
-static int init_linuxrc(struct subprocess_info *info, struct cred *new)
+static int __init early_initrdmem(char *p)
{
- sys_unshare(CLONE_FS | CLONE_FILES);
- /* stdin/stdout/stderr for /linuxrc */
- sys_open("/dev/console", O_RDWR, 0);
- sys_dup(0);
- sys_dup(0);
+ phys_addr_t start;
+ unsigned long size;
+ char *endp;
+
+ start = memparse(p, &endp);
+ if (*endp == ',') {
+ size = memparse(endp + 1, NULL);
+
+ phys_initrd_start = start;
+ phys_initrd_size = size;
+ }
+ return 0;
+}
+early_param("initrdmem", early_initrdmem);
+
+static int __init early_initrd(char *p)
+{
+ return early_initrdmem(p);
+}
+early_param("initrd", early_initrd);
+
+static int __init init_linuxrc(struct subprocess_info *info, struct cred *new)
+{
+ ksys_unshare(CLONE_FS | CLONE_FILES);
+ console_on_rootfs();
/* move initrd over / and chdir/chroot in initrd root */
- sys_chdir("/root");
- sys_mount(".", "/", NULL, MS_MOVE, NULL);
- sys_chroot(".");
- sys_setsid();
+ init_chdir("/root");
+ init_mount(".", "/", NULL, MS_MOVE, NULL);
+ init_chroot(".");
+ ksys_setsid();
return 0;
}
-static void __init handle_initrd(void)
+static void __init handle_initrd(char *root_device_name)
{
struct subprocess_info *info;
static char *argv[] = { "linuxrc", NULL, };
extern char *envp_init[];
int error;
+ pr_warn("using deprecated initrd support, will be removed soon.\n");
+
real_root_dev = new_encode_dev(ROOT_DEV);
create_dev("/dev/root.old", Root_RAM0);
/* mount initrd on rootfs' /root */
- mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY);
- sys_mkdir("/old", 0700);
- sys_chdir("/old");
-
- /* try loading default modules from initrd */
- load_default_modules();
-
- /*
- * In case that a resume from disk is carried out by linuxrc or one of
- * its children, we need to tell the freezer not to wait for us.
- */
- current->flags |= PF_FREEZER_SKIP;
+ mount_root_generic("/dev/root.old", root_device_name,
+ root_mountflags & ~MS_RDONLY);
+ init_mkdir("/old", 0700);
+ init_chdir("/old");
info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
GFP_KERNEL, init_linuxrc, NULL, NULL);
if (!info)
return;
- call_usermodehelper_exec(info, UMH_WAIT_PROC);
-
- current->flags &= ~PF_FREEZER_SKIP;
+ call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE);
/* move initrd to rootfs' /old */
- sys_mount("..", ".", NULL, MS_MOVE, NULL);
+ init_mount("..", ".", NULL, MS_MOVE, NULL);
/* switch root and cwd back to / of rootfs */
- sys_chroot("..");
+ init_chroot("..");
if (new_decode_dev(real_root_dev) == Root_RAM0) {
- sys_chdir("/old");
+ init_chdir("/old");
return;
}
- sys_chdir("/");
+ init_chdir("/");
ROOT_DEV = new_decode_dev(real_root_dev);
- mount_root();
+ mount_root(root_device_name);
printk(KERN_NOTICE "Trying to move old root to /initrd ... ");
- error = sys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL);
+ error = init_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL);
if (!error)
printk("okay\n");
else {
- int fd = sys_open("/dev/root.old", O_RDWR, 0);
if (error == -ENOENT)
printk("/initrd does not exist. Ignored.\n");
else
printk("failed\n");
printk(KERN_NOTICE "Unmounting old root\n");
- sys_umount("/old", MNT_DETACH);
- printk(KERN_NOTICE "Trying to free ramdisk memory ... ");
- if (fd < 0) {
- error = fd;
- } else {
- error = sys_ioctl(fd, BLKFLSBUF, 0);
- sys_close(fd);
- }
- printk(!error ? "okay\n" : "failed\n");
+ init_umount("/old", MNT_DETACH);
}
}
-int __init initrd_load(void)
+bool __init initrd_load(char *root_device_name)
{
if (mount_initrd) {
create_dev("/dev/ram", Root_RAM0);
@@ -127,11 +144,11 @@ int __init initrd_load(void)
* mounted in the normal path.
*/
if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) {
- sys_unlink("/initrd.image");
- handle_initrd();
- return 1;
+ init_unlink("/initrd.image");
+ handle_initrd(root_device_name);
+ return true;
}
}
- sys_unlink("/initrd.image");
- return 0;
+ init_unlink("/initrd.image");
+ return false;
}
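
The early_initrdmem() handler added above parses "initrdmem=<start>,<size>" with memparse(), which reads a number (decimal or 0x-prefixed hex) with an optional size suffix and advances the end pointer past it, so the comma check then picks up the size field. A rough userspace approximation of that suffix handling, for illustration only (the real memparse() also accepts T/P/E suffixes and tolerates a NULL return pointer; this simplified version requires endp to be non-NULL):

#include <stdlib.h>

/* Hedged sketch of memparse()-style parsing: number plus optional K/M/G suffix. */
static unsigned long long memparse_demo(const char *s, char **endp)
{
	unsigned long long val = strtoull(s, endp, 0);

	switch (**endp) {
	case 'G': case 'g':
		val <<= 10;
		/* fall through */
	case 'M': case 'm':
		val <<= 10;
		/* fall through */
	case 'K': case 'k':
		val <<= 10;
		(*endp)++;
		break;
	}
	return val;
}

With this scheme, initrdmem=0x40000000,16M yields a start of 0x40000000 and a size of 16 MiB.
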
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
deleted file mode 100644
index 8cb6db54285b..000000000000
--- a/init/do_mounts_md.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
-#include <linux/delay.h>
-#include <linux/raid/md_u.h>
-#include <linux/raid/md_p.h>
-
-#include "do_mounts.h"
-
-/*
- * When md (and any required personalities) are compiled into the kernel
- * (not a module), arrays can be assembled at boot time with AUTODETECT,
- * where specially marked partitions are registered with md_autodetect_dev(),
- * and with MD_BOOT where devices to be collected are given on the boot line
- * with md=.....
- * The code for that is here.
- */
-
-#ifdef CONFIG_MD_AUTODETECT
-static int __initdata raid_noautodetect;
-#else
-static int __initdata raid_noautodetect=1;
-#endif
-static int __initdata raid_autopart;
-
-static struct {
- int minor;
- int partitioned;
- int level;
- int chunk;
- char *device_names;
-} md_setup_args[256] __initdata;
-
-static int md_setup_ents __initdata;
-
-/*
- * Parse the command-line parameters given our kernel, but do not
- * actually try to invoke the MD device now; that is handled by
- * md_setup_drive after the low-level disk drivers have initialised.
- *
- * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
- * assigns the task of parsing integer arguments to the
- * invoked program now). Added ability to initialise all
- * the MD devices (by specifying multiple "md=" lines)
- * instead of just one. -- KTK
- * 18May2000: Added support for persistent-superblock arrays:
- * md=n,0,factor,fault,device-list uses RAID0 for device n
- * md=n,-1,factor,fault,device-list uses LINEAR for device n
- * md=n,device-list reads a RAID superblock from the devices
- * elements in device-list are read by name_to_kdev_t so can be
- * a hex number or something like /dev/hda1 /dev/sdb
- * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
- * Shifted name_to_kdev_t() and related operations to md_set_drive()
- * for later execution. Rewrote section to make devfs compatible.
- */
-static int __init md_setup(char *str)
-{
- int minor, level, factor, fault, partitioned = 0;
- char *pername = "";
- char *str1;
- int ent;
-
- if (*str == 'd') {
- partitioned = 1;
- str++;
- }
- if (get_option(&str, &minor) != 2) { /* MD Number */
- printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
- return 0;
- }
- str1 = str;
- for (ent=0 ; ent< md_setup_ents ; ent++)
- if (md_setup_args[ent].minor == minor &&
- md_setup_args[ent].partitioned == partitioned) {
- printk(KERN_WARNING "md: md=%s%d, Specified more than once. "
- "Replacing previous definition.\n", partitioned?"d":"", minor);
- break;
- }
- if (ent >= ARRAY_SIZE(md_setup_args)) {
- printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor);
- return 0;
- }
- if (ent >= md_setup_ents)
- md_setup_ents++;
- switch (get_option(&str, &level)) { /* RAID level */
- case 2: /* could be 0 or -1.. */
- if (level == 0 || level == LEVEL_LINEAR) {
- if (get_option(&str, &factor) != 2 || /* Chunk Size */
- get_option(&str, &fault) != 2) {
- printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
- return 0;
- }
- md_setup_args[ent].level = level;
- md_setup_args[ent].chunk = 1 << (factor+12);
- if (level == LEVEL_LINEAR)
- pername = "linear";
- else
- pername = "raid0";
- break;
- }
- /* FALL THROUGH */
- case 1: /* the first device is numeric */
- str = str1;
- /* FALL THROUGH */
- case 0:
- md_setup_args[ent].level = LEVEL_NONE;
- pername="super-block";
- }
-
- printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
- minor, pername, str);
- md_setup_args[ent].device_names = str;
- md_setup_args[ent].partitioned = partitioned;
- md_setup_args[ent].minor = minor;
-
- return 1;
-}
-
-static void __init md_setup_drive(void)
-{
- int minor, i, ent, partitioned;
- dev_t dev;
- dev_t devices[MD_SB_DISKS+1];
-
- for (ent = 0; ent < md_setup_ents ; ent++) {
- int fd;
- int err = 0;
- char *devname;
- mdu_disk_info_t dinfo;
- char name[16];
-
- minor = md_setup_args[ent].minor;
- partitioned = md_setup_args[ent].partitioned;
- devname = md_setup_args[ent].device_names;
-
- sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor);
- if (partitioned)
- dev = MKDEV(mdp_major, minor << MdpMinorShift);
- else
- dev = MKDEV(MD_MAJOR, minor);
- create_dev(name, dev);
- for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) {
- char *p;
- char comp_name[64];
- u32 rdev;
-
- p = strchr(devname, ',');
- if (p)
- *p++ = 0;
-
- dev = name_to_dev_t(devname);
- if (strncmp(devname, "/dev/", 5) == 0)
- devname += 5;
- snprintf(comp_name, 63, "/dev/%s", devname);
- rdev = bstat(comp_name);
- if (rdev)
- dev = new_decode_dev(rdev);
- if (!dev) {
- printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
- break;
- }
-
- devices[i] = dev;
-
- devname = p;
- }
- devices[i] = 0;
-
- if (!i)
- continue;
-
- printk(KERN_INFO "md: Loading md%s%d: %s\n",
- partitioned ? "_d" : "", minor,
- md_setup_args[ent].device_names);
-
- fd = sys_open(name, 0, 0);
- if (fd < 0) {
- printk(KERN_ERR "md: open failed - cannot start "
- "array %s\n", name);
- continue;
- }
- if (sys_ioctl(fd, SET_ARRAY_INFO, 0) == -EBUSY) {
- printk(KERN_WARNING
- "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
- minor);
- sys_close(fd);
- continue;
- }
-
- if (md_setup_args[ent].level != LEVEL_NONE) {
- /* non-persistent */
- mdu_array_info_t ainfo;
- ainfo.level = md_setup_args[ent].level;
- ainfo.size = 0;
- ainfo.nr_disks =0;
- ainfo.raid_disks =0;
- while (devices[ainfo.raid_disks])
- ainfo.raid_disks++;
- ainfo.md_minor =minor;
- ainfo.not_persistent = 1;
-
- ainfo.state = (1 << MD_SB_CLEAN);
- ainfo.layout = 0;
- ainfo.chunk_size = md_setup_args[ent].chunk;
- err = sys_ioctl(fd, SET_ARRAY_INFO, (long)&ainfo);
- for (i = 0; !err && i <= MD_SB_DISKS; i++) {
- dev = devices[i];
- if (!dev)
- break;
- dinfo.number = i;
- dinfo.raid_disk = i;
- dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
- dinfo.major = MAJOR(dev);
- dinfo.minor = MINOR(dev);
- err = sys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo);
- }
- } else {
- /* persistent */
- for (i = 0; i <= MD_SB_DISKS; i++) {
- dev = devices[i];
- if (!dev)
- break;
- dinfo.major = MAJOR(dev);
- dinfo.minor = MINOR(dev);
- sys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo);
- }
- }
- if (!err)
- err = sys_ioctl(fd, RUN_ARRAY, 0);
- if (err)
- printk(KERN_WARNING "md: starting md%d failed\n", minor);
- else {
- /* reread the partition table.
-		 * I (neilb) am not sure why this is needed, but I cannot
- * boot a kernel with devfs compiled in from partitioned md
- * array without it
- */
- sys_close(fd);
- fd = sys_open(name, 0, 0);
- sys_ioctl(fd, BLKRRPART, 0);
- }
- sys_close(fd);
- }
-}
-
-static int __init raid_setup(char *str)
-{
- int len, pos;
-
- len = strlen(str) + 1;
- pos = 0;
-
- while (pos < len) {
- char *comma = strchr(str+pos, ',');
- int wlen;
- if (comma)
- wlen = (comma-str)-pos;
- else wlen = (len-1)-pos;
-
- if (!strncmp(str, "noautodetect", wlen))
- raid_noautodetect = 1;
- if (!strncmp(str, "autodetect", wlen))
- raid_noautodetect = 0;
- if (strncmp(str, "partitionable", wlen)==0)
- raid_autopart = 1;
- if (strncmp(str, "part", wlen)==0)
- raid_autopart = 1;
- pos += wlen+1;
- }
- return 1;
-}
-
-__setup("raid=", raid_setup);
-__setup("md=", md_setup);
-
-static void __init autodetect_raid(void)
-{
- int fd;
-
- /*
- * Since we don't want to detect and use half a raid array, we need to
- * wait for the known devices to complete their probing
- */
- printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n");
- printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n");
-
- wait_for_device_probe();
-
- fd = sys_open("/dev/md0", 0, 0);
- if (fd >= 0) {
- sys_ioctl(fd, RAID_AUTORUN, raid_autopart);
- sys_close(fd);
- }
-}
-
-void __init md_run_setup(void)
-{
- create_dev("/dev/md0", MKDEV(MD_MAJOR, 0));
-
- if (raid_noautodetect)
- printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n");
- else
- autodetect_raid();
- md_setup_drive();
-}
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index 6be2879cca66..eddbe5cb0413 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -1,21 +1,13 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/minix_fs.h>
#include <linux/ext2_fs.h>
#include <linux/romfs_fs.h>
-#include <linux/cramfs_fs.h>
+#include <uapi/linux/cramfs_fs.h>
#include <linux/initrd.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/slab.h>
#include "do_mounts.h"
@@ -23,12 +15,12 @@
#include <linux/decompress/generic.h>
-
-int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */
+static struct file *in_file, *out_file;
+static loff_t in_pos, out_pos;
static int __init prompt_ramdisk(char *str)
{
- rd_prompt = simple_strtol(str,NULL,0) & 1;
+ pr_warn("ignoring the deprecated prompt_ramdisk= option\n");
return 1;
}
__setup("prompt_ramdisk=", prompt_ramdisk);
@@ -37,12 +29,11 @@ int __initdata rd_image_start; /* starting block # of image */
static int __init ramdisk_start_setup(char *str)
{
- rd_image_start = simple_strtol(str,NULL,0);
- return 1;
+ return kstrtoint(str, 0, &rd_image_start) == 0;
}
__setup("ramdisk_start=", ramdisk_start_setup);
-static int __init crd_load(int in_fd, int out_fd, decompress_fn deco);
+static int __init crd_load(decompress_fn deco);
/*
* This routine tries to find a RAM disk image to load, and returns the
@@ -57,9 +48,15 @@ static int __init crd_load(int in_fd, int out_fd, decompress_fn deco);
* cramfs
* squashfs
* gzip
+ * bzip2
+ * lzma
+ * xz
+ * lzo
+ * lz4
*/
static int __init
-identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
+identify_ramdisk_image(struct file *file, loff_t pos,
+ decompress_fn *decompressor)
{
const int size = 512;
struct minix_super_block *minixsb;
@@ -70,6 +67,7 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
unsigned char *buf;
const char *compress_name;
unsigned long n;
+ int start_block = rd_image_start;
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
@@ -84,8 +82,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
/*
* Read block 0 to test for compressed kernel
*/
- sys_lseek(fd, start_block * BLOCK_SIZE, 0);
- sys_read(fd, buf, size);
+ pos = start_block * BLOCK_SIZE;
+ kernel_read(file, buf, size, &pos);
*decompressor = decompress_method(buf, size, &compress_name);
if (compress_name) {
@@ -130,8 +128,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
/*
* Read 512 bytes further to check if cramfs is padded
*/
- sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0);
- sys_read(fd, buf, size);
+ pos = start_block * BLOCK_SIZE + 0x200;
+ kernel_read(file, buf, size, &pos);
if (cramfsb->magic == CRAMFS_MAGIC) {
printk(KERN_NOTICE
@@ -144,8 +142,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
/*
* Read block 1 to test for minix and ext2 superblock
*/
- sys_lseek(fd, (start_block+1) * BLOCK_SIZE, 0);
- sys_read(fd, buf, size);
+ pos = (start_block + 1) * BLOCK_SIZE;
+ kernel_read(file, buf, size, &pos);
/* Try minix */
if (minixsb->s_magic == MINIX_SUPER_MAGIC ||
@@ -172,38 +170,44 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
start_block);
done:
- sys_lseek(fd, start_block * BLOCK_SIZE, 0);
kfree(buf);
return nblocks;
}
+static unsigned long nr_blocks(struct file *file)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ if (!S_ISBLK(inode->i_mode))
+ return 0;
+ return i_size_read(inode) >> 10;
+}
+
int __init rd_load_image(char *from)
{
int res = 0;
- int in_fd, out_fd;
- unsigned long rd_blocks, devblocks;
- int nblocks, i, disk;
+ unsigned long rd_blocks, devblocks, nr_disks;
+ int nblocks, i;
char *buf = NULL;
unsigned short rotate = 0;
decompress_fn decompressor = NULL;
-#if !defined(CONFIG_S390)
char rotator[4] = { '|' , '/' , '-' , '\\' };
-#endif
- out_fd = sys_open("/dev/ram", O_RDWR, 0);
- if (out_fd < 0)
+ out_file = filp_open("/dev/ram", O_RDWR, 0);
+ if (IS_ERR(out_file))
goto out;
- in_fd = sys_open(from, O_RDONLY, 0);
- if (in_fd < 0)
+ in_file = filp_open(from, O_RDONLY, 0);
+ if (IS_ERR(in_file))
goto noclose_input;
- nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor);
+ in_pos = rd_image_start * BLOCK_SIZE;
+ nblocks = identify_ramdisk_image(in_file, in_pos, &decompressor);
if (nblocks < 0)
goto done;
if (nblocks == 0) {
- if (crd_load(in_fd, out_fd, decompressor) == 0)
+ if (crd_load(decompressor) == 0)
goto successful_load;
goto done;
}
@@ -211,19 +215,8 @@ int __init rd_load_image(char *from)
/*
* NOTE NOTE: nblocks is not actually blocks but
* the number of kibibytes of data to load into a ramdisk.
- * So any ramdisk block size that is a multiple of 1KiB should
- * work when the appropriate ramdisk_blocksize is specified
- * on the command line.
- *
- * The default ramdisk_blocksize is 1KiB and it is generally
- * silly to use anything else, so make sure to use 1KiB
- * blocksize while generating ext2fs ramdisk-images.
*/
- if (sys_ioctl(out_fd, BLKGETSIZE, (unsigned long)&rd_blocks) < 0)
- rd_blocks = 0;
- else
- rd_blocks >>= 1;
-
+ rd_blocks = nr_blocks(out_file);
if (nblocks > rd_blocks) {
printk("RAMDISK: image too big! (%dKiB/%ldKiB)\n",
nblocks, rd_blocks);
@@ -233,13 +226,10 @@ int __init rd_load_image(char *from)
/*
* OK, time to copy in the data
*/
- if (sys_ioctl(in_fd, BLKGETSIZE, (unsigned long)&devblocks) < 0)
- devblocks = 0;
- else
- devblocks >>= 1;
-
if (strcmp(from, "/initrd.image") == 0)
devblocks = nblocks;
+ else
+ devblocks = nr_blocks(in_file);
if (devblocks == 0) {
printk(KERN_ERR "RAMDISK: could not determine device size\n");
@@ -252,51 +242,39 @@ int __init rd_load_image(char *from)
goto done;
}
- printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
- nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : "");
- for (i = 0, disk = 1; i < nblocks; i++) {
+ nr_disks = (nblocks - 1) / devblocks + 1;
+ pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
+ nblocks, nr_disks, str_plural(nr_disks));
+ for (i = 0; i < nblocks; i++) {
if (i && (i % devblocks == 0)) {
- printk("done disk #%d.\n", disk++);
+ pr_cont("done disk #1.\n");
rotate = 0;
- if (sys_close(in_fd)) {
- printk("Error closing the disk.\n");
- goto noclose_input;
- }
- change_floppy("disk #%d", disk);
- in_fd = sys_open(from, O_RDONLY, 0);
- if (in_fd < 0) {
- printk("Error opening disk.\n");
- goto noclose_input;
- }
- printk("Loading disk #%d... ", disk);
+ fput(in_file);
+ break;
}
- sys_read(in_fd, buf, BLOCK_SIZE);
- sys_write(out_fd, buf, BLOCK_SIZE);
-#if !defined(CONFIG_S390)
- if (!(i % 16)) {
- printk("%c\b", rotator[rotate & 0x3]);
+ kernel_read(in_file, buf, BLOCK_SIZE, &in_pos);
+ kernel_write(out_file, buf, BLOCK_SIZE, &out_pos);
+ if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) {
+ pr_cont("%c\b", rotator[rotate & 0x3]);
rotate++;
}
-#endif
}
- printk("done.\n");
+ pr_cont("done.\n");
successful_load:
res = 1;
done:
- sys_close(in_fd);
+ fput(in_file);
noclose_input:
- sys_close(out_fd);
+ fput(out_file);
out:
kfree(buf);
- sys_unlink("/dev/ram");
+ init_unlink("/dev/ram");
return res;
}
int __init rd_load_disk(int n)
{
- if (rd_prompt)
- change_floppy("root floppy disk to be loaded into RAM disk");
create_dev("/dev/root", ROOT_DEV);
create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n));
return rd_load_image("/dev/root");
@@ -304,11 +282,10 @@ int __init rd_load_disk(int n)
static int exit_code;
static int decompress_error;
-static int crd_infd, crd_outfd;
-static int __init compr_fill(void *buf, unsigned int len)
+static long __init compr_fill(void *buf, unsigned long len)
{
- int r = sys_read(crd_infd, buf, len);
+ long r = kernel_read(in_file, buf, len, &in_pos);
if (r < 0)
printk(KERN_ERR "RAMDISK: error while reading compressed data");
else if (r == 0)
@@ -316,13 +293,13 @@ static int __init compr_fill(void *buf, unsigned int len)
return r;
}
-static int __init compr_flush(void *window, unsigned int outcnt)
+static long __init compr_flush(void *window, unsigned long outcnt)
{
- int written = sys_write(crd_outfd, window, outcnt);
+ long written = kernel_write(out_file, window, outcnt, &out_pos);
if (written != outcnt) {
if (decompress_error == 0)
printk(KERN_ERR
- "RAMDISK: incomplete write (%d != %d)\n",
+ "RAMDISK: incomplete write (%ld != %ld)\n",
written, outcnt);
decompress_error = 1;
return -1;
@@ -337,11 +314,16 @@ static void __init error(char *x)
decompress_error = 1;
}
-static int __init crd_load(int in_fd, int out_fd, decompress_fn deco)
+static int __init crd_load(decompress_fn deco)
{
int result;
- crd_infd = in_fd;
- crd_outfd = out_fd;
+
+ if (!deco) {
+ pr_emerg("Invalid ramdisk decompression routine. "
+ "Select appropriate config option.\n");
+ panic("Could not decompress initial ramdisk image.");
+ }
+
result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error);
if (decompress_error)
result = 1;
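
With the conversion above, crd_load() drives the generic decompressor interface through file-backed callbacks: the selected decompress_fn pulls compressed bytes via compr_fill() and hands decompressed output to compr_flush(), so the image is streamed rather than held in a single buffer. A toy userspace sketch of that callback contract with a pass-through "decompressor", for illustration only (the real decompress_fn additionally takes an input buffer, its length, an output pointer and an error callback):

#include <stdio.h>

typedef long (*fill_fn)(void *buf, unsigned long len);
typedef long (*flush_fn)(void *buf, unsigned long len);

static FILE *in_file, *out_file;

static long fill(void *buf, unsigned long len)
{
	return fread(buf, 1, len, in_file);
}

static long flush(void *buf, unsigned long len)
{
	return fwrite(buf, 1, len, out_file);
}

/* Toy pass-through "decompressor": pull input via fill(), push output via flush(). */
static int copy_decompress(fill_fn fill_cb, flush_fn flush_cb)
{
	char window[4096];
	long n;

	while ((n = fill_cb(window, sizeof(window))) > 0)
		if (flush_cb(window, n) != n)
			return -1;
	return n < 0 ? -1 : 0;
}

int main(void)
{
	in_file = stdin;
	out_file = stdout;
	return copy_decompress(fill, flush) ? 1 : 0;
}
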
diff --git a/init/init_task.c b/init/init_task.c
index ba0a7f362d9e..49b13d7c3985 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -1,26 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init_task.h>
#include <linux/export.h>
#include <linux/mqueue.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/sched/ext.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/audit.h>
+#include <linux/numa.h>
+#include <linux/scs.h>
+#include <linux/plist.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+static struct signal_struct init_signals = {
+ .nr_threads = 1,
+ .thread_head = LIST_HEAD_INIT(init_task.thread_node),
+ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
+ .shared_pending = {
+ .list = LIST_HEAD_INIT(init_signals.shared_pending.list),
+ .signal = {{0}}
+ },
+ .multiprocess = HLIST_HEAD_INIT,
+ .rlim = INIT_RLIMITS,
+#ifdef CONFIG_CGROUPS
+ .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
+#endif
+ .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
+ .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
+#ifdef CONFIG_POSIX_TIMERS
+ .posix_timers = HLIST_HEAD_INIT,
+ .ignored_posix_timers = HLIST_HEAD_INIT,
+ .cputimer = {
+ .cputime_atomic = INIT_CPUTIME_ATOMIC,
+ },
+#endif
+ INIT_CPU_TIMERS(init_signals)
+ .pids = {
+ [PIDTYPE_PID] = &init_struct_pid,
+ [PIDTYPE_TGID] = &init_struct_pid,
+ [PIDTYPE_PGID] = &init_struct_pid,
+ [PIDTYPE_SID] = &init_struct_pid,
+ },
+ INIT_PREV_CPUTIME(init_signals)
+};
-/* Initial task structure */
-struct task_struct init_task = INIT_TASK(init_task);
+static struct sighand_struct init_sighand = {
+ .count = REFCOUNT_INIT(1),
+ .action = { { { .sa_handler = SIG_DFL, } }, },
+ .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
+ .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
+};
+
+#ifdef CONFIG_SHADOW_CALL_STACK
+unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
+ [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
+};
+#endif
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
+
+/*
+ * The initial credentials for the initial task
+ */
+static struct cred init_cred = {
+ .usage = ATOMIC_INIT(4),
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
+ .securebits = SECUREBITS_DEFAULT,
+ .cap_inheritable = CAP_EMPTY_SET,
+ .cap_permitted = CAP_FULL_SET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
+ .user = INIT_USER,
+ .user_ns = &init_user_ns,
+ .group_info = &init_groups,
+ .ucounts = &init_ucounts,
+};
+
+/*
+ * Set up the first task table, touch at your own risk!. Base=0,
+ * limit=0x1fffff (=2MB)
+ */
+struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ .thread_info = INIT_THREAD_INFO(init_task),
+ .stack_refcount = REFCOUNT_INIT(1),
+#endif
+ .__state = 0,
+ .stack = init_stack,
+ .usage = REFCOUNT_INIT(2),
+ .flags = PF_KTHREAD,
+ .prio = MAX_PRIO - 20,
+ .static_prio = MAX_PRIO - 20,
+ .normal_prio = MAX_PRIO - 20,
+ .policy = SCHED_NORMAL,
+ .cpus_ptr = &init_task.cpus_mask,
+ .user_cpus_ptr = NULL,
+ .cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
+ .nr_cpus_allowed= NR_CPUS,
+ .mm = NULL,
+ .active_mm = &init_mm,
+ .faults_disabled_mapping = NULL,
+ .restart_block = {
+ .fn = do_no_restart_syscall,
+ },
+ .se = {
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ },
+ .rt = {
+ .run_list = LIST_HEAD_INIT(init_task.rt.run_list),
+ .time_slice = RR_TIMESLICE,
+ },
+ .tasks = LIST_HEAD_INIT(init_task.tasks),
+#ifdef CONFIG_SMP
+ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ .sched_task_group = &root_task_group,
+#endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+ .scx = {
+ .dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node),
+ .sticky_cpu = -1,
+ .holding_cpu = -1,
+ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node),
+ .runnable_at = INITIAL_JIFFIES,
+ .ddsp_dsq_id = SCX_DSQ_INVALID,
+ .slice = SCX_SLICE_DFL,
+ },
+#endif
+ .ptraced = LIST_HEAD_INIT(init_task.ptraced),
+ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
+ .real_parent = &init_task,
+ .parent = &init_task,
+ .children = LIST_HEAD_INIT(init_task.children),
+ .sibling = LIST_HEAD_INIT(init_task.sibling),
+ .group_leader = &init_task,
+ RCU_POINTER_INITIALIZER(real_cred, &init_cred),
+ RCU_POINTER_INITIALIZER(cred, &init_cred),
+ .comm = INIT_TASK_COMM,
+ .thread = INIT_THREAD,
+ .fs = &init_fs,
+ .files = &init_files,
+#ifdef CONFIG_IO_URING
+ .io_uring = NULL,
+#endif
+ .signal = &init_signals,
+ .sighand = &init_sighand,
+ .nsproxy = &init_nsproxy,
+ .pending = {
+ .list = LIST_HEAD_INIT(init_task.pending.list),
+ .signal = {{0}}
+ },
+ .blocked = {{0}},
+ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
+ .journal_info = NULL,
+ INIT_CPU_TIMERS(init_task)
+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
+ .timer_slack_ns = 50000, /* 50 usec default slack */
+ .thread_pid = &init_struct_pid,
+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head),
+#ifdef CONFIG_AUDIT
+ .loginuid = INVALID_UID,
+ .sessionid = AUDIT_SID_UNSET,
+#endif
+#ifdef CONFIG_PERF_EVENTS
+ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
+ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+ .rcu_read_lock_nesting = 0,
+ .rcu_read_unlock_special.s = 0,
+ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
+ .rcu_blocked_node = NULL,
+#endif
+#ifdef CONFIG_TASKS_RCU
+ .rcu_tasks_holdout = false,
+ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
+ .rcu_tasks_idle_cpu = -1,
+ .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
+#endif
+#ifdef CONFIG_TASKS_TRACE_RCU
+ .trc_reader_nesting = 0,
+ .trc_reader_special.s = 0,
+ .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
+ .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
+#endif
+#ifdef CONFIG_CPUSETS
+ .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
+ &init_task.alloc_lock),
+#endif
+#ifdef CONFIG_RT_MUTEXES
+ .pi_waiters = RB_ROOT_CACHED,
+ .pi_top_task = NULL,
+#endif
+ INIT_PREV_CPUTIME(init_task)
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
+ .vtime.starttime = 0,
+ .vtime.state = VTIME_SYS,
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ .numa_preferred_nid = NUMA_NO_NODE,
+ .numa_group = NULL,
+ .numa_faults = NULL,
+#endif
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+ .kasan_depth = 1,
+#endif
+#ifdef CONFIG_KCSAN
+ .kcsan_ctx = {
+ .scoped_accesses = {LIST_POISON1, NULL},
+ },
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ .softirqs_enabled = 1,
+#endif
+#ifdef CONFIG_LOCKDEP
+ .lockdep_depth = 0, /* no locks held yet */
+ .curr_chain_key = INITIAL_CHAIN_KEY,
+ .lockdep_recursion = 0,
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ .ret_stack = NULL,
+ .tracing_graph_pause = ATOMIC_INIT(0),
+#endif
+#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
+ .trace_recursion = 0,
+#endif
+#ifdef CONFIG_LIVEPATCH
+ .patch_state = KLP_TRANSITION_IDLE,
+#endif
+#ifdef CONFIG_SECURITY
+ .security = NULL,
+#endif
+#ifdef CONFIG_SECCOMP_FILTER
+ .seccomp = { .filter_count = ATOMIC_INIT(0) },
+#endif
+#ifdef CONFIG_SCHED_MM_CID
+ .mm_cid = { .cid = MM_CID_UNSET, },
+#endif
+};
EXPORT_SYMBOL(init_task);
/*
* Initial thread structure. Alignment of this is handled by a special
* linker map entry.
*/
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
+#endif
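
The rewritten init_task.c above spells out the initial task, signal, sighand and credential structures with C99 designated initializers and per-option #ifdef blocks instead of the old INIT_TASK()-style macros. A generic sketch of that initializer pattern, using hypothetical struct and config names purely for illustration:

#include <stddef.h>

#define DEMO_CACHE_LINE 64

struct demo_task {
	int prio;
	void *stack;
#ifdef CONFIG_DEMO_AUDIT
	unsigned int sessionid;
#endif
};

/*
 * Members not named in the initializer are implicitly zero-initialized;
 * members that only exist under a config option are set inside the same
 * #ifdef, mirroring how init_task handles optional fields.
 */
struct demo_task demo_init_task __attribute__((aligned(DEMO_CACHE_LINE))) = {
	.prio  = 120,
	.stack = NULL,
#ifdef CONFIG_DEMO_AUDIT
	.sessionid = 0,
#endif
};
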
diff --git a/init/initramfs.c b/init/initramfs.c
index a67ef9dbda9d..6ddbfb17fb8f 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -1,14 +1,7 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers. To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
+#include <linux/async.h>
+#include <linux/export.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -18,6 +11,52 @@
#include <linux/dirent.h>
#include <linux/syscalls.h>
#include <linux/utime.h>
+#include <linux/file.h>
+#include <linux/kstrtox.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <linux/init_syscalls.h>
+#include <linux/umh.h>
+#include <linux/security.h>
+#include <linux/overflow.h>
+
+#include "do_mounts.h"
+#include "initramfs_internal.h"
+
+static __initdata bool csum_present;
+static __initdata u32 io_csum;
+
+static ssize_t __init xwrite(struct file *file, const unsigned char *p,
+ size_t count, loff_t *pos)
+{
+ ssize_t out = 0;
+
+ /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */
+ while (count) {
+ ssize_t rv = kernel_write(file, p, count, pos);
+
+ if (rv < 0) {
+ if (rv == -EINTR || rv == -EAGAIN)
+ continue;
+ return out ? out : rv;
+ } else if (rv == 0)
+ break;
+
+ if (csum_present) {
+ ssize_t i;
+
+ for (i = 0; i < rv; i++)
+ io_csum += p[i];
+ }
+
+ p += rv;
+ out += rv;
+ count -= rv;
+ }
+
+ return out;
+}
static __initdata char *message;
static void __init error(char *x)
@@ -26,6 +65,9 @@ static void __init error(char *x)
message = x;
}
+#define panic_show_mem(fmt, ...) \
+ ({ show_mem(); panic(fmt, ##__VA_ARGS__); })
+
/* link hash */
#define N_ALIGN(len) ((((len) + 1) & ~3) + 2)
@@ -36,6 +78,7 @@ static __initdata struct hash {
struct hash *next;
char name[N_ALIGN(PATH_MAX)];
} *head[32];
+static __initdata bool hardlink_seen;
static inline int hash(int major, int minor, int ino)
{
@@ -61,55 +104,60 @@ static char __init *find_link(int major, int minor, int ino,
}
q = kmalloc(sizeof(struct hash), GFP_KERNEL);
if (!q)
- panic("can't allocate link hash entry");
+ panic_show_mem("can't allocate link hash entry");
q->major = major;
q->minor = minor;
q->ino = ino;
q->mode = mode;
- strcpy(q->name, name);
+ strscpy(q->name, name);
q->next = NULL;
*p = q;
+ hardlink_seen = true;
return NULL;
}
static void __init free_hash(void)
{
struct hash **p, *q;
- for (p = head; p < head + 32; p++) {
+ for (p = head; hardlink_seen && p < head + 32; p++) {
while (*p) {
q = *p;
*p = q->next;
kfree(q);
}
}
+ hardlink_seen = false;
}
-static long __init do_utime(char *filename, time_t mtime)
+#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME
+static void __init do_utime(char *filename, time64_t mtime)
{
- struct timespec t[2];
-
- t[0].tv_sec = mtime;
- t[0].tv_nsec = 0;
- t[1].tv_sec = mtime;
- t[1].tv_nsec = 0;
+ struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
+ init_utimes(filename, t);
+}
- return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW);
+static void __init do_utime_path(const struct path *path, time64_t mtime)
+{
+ struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
+ vfs_utimes(path, t);
}
static __initdata LIST_HEAD(dir_list);
struct dir_entry {
struct list_head list;
- char *name;
- time_t mtime;
+ time64_t mtime;
+ char name[];
};
-static void __init dir_add(const char *name, time_t mtime)
+static void __init dir_add(const char *name, size_t nlen, time64_t mtime)
{
- struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL);
+ struct dir_entry *de;
+
+ de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL);
if (!de)
- panic("can't allocate dir_entry buffer");
+ panic_show_mem("can't allocate dir_entry buffer");
INIT_LIST_HEAD(&de->list);
- de->name = kstrdup(name, GFP_KERNEL);
+ strscpy(de->name, name, nlen);
de->mtime = mtime;
list_add(&de->list, &dir_list);
}
@@ -120,12 +168,17 @@ static void __init dir_utime(void)
list_for_each_entry_safe(de, tmp, &dir_list, list) {
list_del(&de->list);
do_utime(de->name, de->mtime);
- kfree(de->name);
kfree(de);
}
}
+#else
+static void __init do_utime(char *filename, time64_t mtime) {}
+static void __init do_utime_path(const struct path *path, time64_t mtime) {}
+static void __init dir_add(const char *name, size_t nlen, time64_t mtime) {}
+static void __init dir_utime(void) {}
+#endif
-static __initdata time_t mtime;
+static __initdata time64_t mtime;
/* cpio header parsing */
@@ -135,29 +188,28 @@ static __initdata unsigned long body_len, name_len;
static __initdata uid_t uid;
static __initdata gid_t gid;
static __initdata unsigned rdev;
+static __initdata u32 hdr_csum;
static void __init parse_header(char *s)
{
- unsigned long parsed[12];
- char buf[9];
+ unsigned long parsed[13];
int i;
- buf[8] = '\0';
- for (i = 0, s += 6; i < 12; i++, s += 8) {
- memcpy(buf, s, 8);
- parsed[i] = simple_strtoul(buf, NULL, 16);
- }
+ for (i = 0, s += 6; i < 13; i++, s += 8)
+ parsed[i] = simple_strntoul(s, NULL, 16, 8);
+
ino = parsed[0];
mode = parsed[1];
uid = parsed[2];
gid = parsed[3];
nlink = parsed[4];
- mtime = parsed[5];
+ mtime = parsed[5]; /* breaks in y2106 */
body_len = parsed[6];
major = parsed[7];
minor = parsed[8];
rdev = new_encode_dev(MKDEV(parsed[9], parsed[10]));
name_len = parsed[11];
+ hdr_csum = parsed[12];
}
/* FSM */
@@ -174,24 +226,23 @@ static __initdata enum state {
} state, next_state;
static __initdata char *victim;
-static __initdata unsigned count;
+static unsigned long byte_count __initdata;
static __initdata loff_t this_header, next_header;
static inline void __init eat(unsigned n)
{
victim += n;
this_header += n;
- count -= n;
+ byte_count -= n;
}
-static __initdata char *vcollected;
static __initdata char *collected;
-static __initdata int remains;
+static long remains __initdata;
static __initdata char *collect;
static void __init read_into(char *buf, unsigned size, enum state next)
{
- if (count >= size) {
+ if (byte_count >= size) {
collected = victim;
eat(size);
state = next;
@@ -207,15 +258,15 @@ static __initdata char *header_buf, *symlink_buf, *name_buf;
static int __init do_start(void)
{
- read_into(header_buf, 110, GotHeader);
+ read_into(header_buf, CPIO_HDRLEN, GotHeader);
return 0;
}
static int __init do_collect(void)
{
- unsigned n = remains;
- if (count < n)
- n = count;
+ unsigned long n = remains;
+ if (byte_count < n)
+ n = byte_count;
memcpy(collect, victim, n);
eat(n);
collect += n;
@@ -227,12 +278,15 @@ static int __init do_collect(void)
static int __init do_header(void)
{
- if (memcmp(collected, "070707", 6)==0) {
- error("incorrect cpio method used: use -H newc option");
- return 1;
- }
- if (memcmp(collected, "070701", 6)) {
- error("no cpio magic");
+ if (!memcmp(collected, "070701", 6)) {
+ csum_present = false;
+ } else if (!memcmp(collected, "070702", 6)) {
+ csum_present = true;
+ } else {
+ if (memcmp(collected, "070707", 6) == 0)
+ error("incorrect cpio method used: use -H newc option");
+ else
+ error("no cpio magic");
return 1;
}
parse_header(collected);
@@ -257,8 +311,8 @@ static int __init do_header(void)
static int __init do_skip(void)
{
- if (this_header + count < next_header) {
- eat(count);
+ if (this_header + byte_count < next_header) {
+ eat(byte_count);
return 1;
} else {
eat(next_header - this_header);
@@ -269,41 +323,54 @@ static int __init do_skip(void)
static int __init do_reset(void)
{
- while(count && *victim == '\0')
+ while (byte_count && *victim == '\0')
eat(1);
- if (count && (this_header & 3))
+ if (byte_count && (this_header & 3))
error("broken padding");
return 1;
}
-static int __init maybe_link(void)
+static void __init clean_path(char *path, umode_t fmode)
{
- if (nlink >= 2) {
- char *old = find_link(major, minor, ino, mode, collected);
- if (old)
- return (sys_link(old, collected) < 0) ? -1 : 1;
+ struct kstat st;
+
+ if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) &&
+ (st.mode ^ fmode) & S_IFMT) {
+ if (S_ISDIR(st.mode))
+ init_rmdir(path);
+ else
+ init_unlink(path);
}
- return 0;
}
-static void __init clean_path(char *path, umode_t mode)
+static int __init maybe_link(void)
{
- struct stat st;
-
- if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) {
- if (S_ISDIR(st.st_mode))
- sys_rmdir(path);
- else
- sys_unlink(path);
+ if (nlink >= 2) {
+ char *old = find_link(major, minor, ino, mode, collected);
+ if (old) {
+ clean_path(collected, 0);
+ return (init_link(old, collected) < 0) ? -1 : 1;
+ }
}
+ return 0;
}
-static __initdata int wfd;
+static __initdata struct file *wfile;
+static __initdata loff_t wfile_pos;
static int __init do_name(void)
{
state = SkipIt;
next_state = Reset;
+
+ /* name_len > 0 && name_len <= PATH_MAX checked in do_header */
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs name without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
+
if (strcmp(collected, "TRAILER!!!") == 0) {
free_hash();
return 0;
@@ -312,31 +379,32 @@ static int __init do_name(void)
if (S_ISREG(mode)) {
int ml = maybe_link();
if (ml >= 0) {
- int openflags = O_WRONLY|O_CREAT;
+ int openflags = O_WRONLY|O_CREAT|O_LARGEFILE;
if (ml != 1)
openflags |= O_TRUNC;
- wfd = sys_open(collected, openflags, mode);
-
- if (wfd >= 0) {
- sys_fchown(wfd, uid, gid);
- sys_fchmod(wfd, mode);
- if (body_len)
- sys_ftruncate(wfd, body_len);
- vcollected = kstrdup(collected, GFP_KERNEL);
- state = CopyFile;
- }
+ wfile = filp_open(collected, openflags, mode);
+ if (IS_ERR(wfile))
+ return 0;
+ wfile_pos = 0;
+ io_csum = 0;
+
+ vfs_fchown(wfile, uid, gid);
+ vfs_fchmod(wfile, mode);
+ if (body_len)
+ vfs_truncate(&wfile->f_path, body_len);
+ state = CopyFile;
}
} else if (S_ISDIR(mode)) {
- sys_mkdir(collected, mode);
- sys_chown(collected, uid, gid);
- sys_chmod(collected, mode);
- dir_add(collected, mtime);
+ init_mkdir(collected, mode);
+ init_chown(collected, uid, gid, 0);
+ init_chmod(collected, mode);
+ dir_add(collected, name_len, mtime);
} else if (S_ISBLK(mode) || S_ISCHR(mode) ||
S_ISFIFO(mode) || S_ISSOCK(mode)) {
if (maybe_link() == 0) {
- sys_mknod(collected, mode, rdev);
- sys_chown(collected, uid, gid);
- sys_chmod(collected, mode);
+ init_mknod(collected, mode, rdev);
+ init_chown(collected, uid, gid, 0);
+ init_chmod(collected, mode);
do_utime(collected, mtime);
}
}
@@ -345,28 +413,38 @@ static int __init do_name(void)
static int __init do_copy(void)
{
- if (count >= body_len) {
- sys_write(wfd, victim, body_len);
- sys_close(wfd);
- do_utime(vcollected, mtime);
- kfree(vcollected);
+ if (byte_count >= body_len) {
+ if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len)
+ error("write error");
+
+ do_utime_path(&wfile->f_path, mtime);
+ fput(wfile);
+ if (csum_present && io_csum != hdr_csum)
+ error("bad data checksum");
eat(body_len);
state = SkipIt;
return 0;
} else {
- sys_write(wfd, victim, count);
- body_len -= count;
- eat(count);
+ if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count)
+ error("write error");
+ body_len -= byte_count;
+ eat(byte_count);
return 1;
}
}
static int __init do_symlink(void)
{
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs symlink without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
collected[N_ALIGN(name_len) + body_len] = '\0';
clean_path(collected, 0);
- sys_symlink(collected + N_ALIGN(name_len), collected);
- sys_lchown(collected, uid, gid);
+ init_symlink(collected + N_ALIGN(name_len), collected);
+ init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW);
do_utime(collected, mtime);
state = SkipIt;
next_state = Reset;
@@ -384,21 +462,21 @@ static __initdata int (*actions[])(void) = {
[Reset] = do_reset,
};
-static int __init write_buffer(char *buf, unsigned len)
+static long __init write_buffer(char *buf, unsigned long len)
{
- count = len;
+ byte_count = len;
victim = buf;
while (!actions[state]())
;
- return len - count;
+ return len - byte_count;
}
-static int __init flush_buffer(void *bufv, unsigned len)
+static long __init flush_buffer(void *bufv, unsigned long len)
{
- char *buf = (char *) bufv;
- int written;
- int origLen = len;
+ char *buf = bufv;
+ long written;
+ long origLen = len;
if (message)
return -1;
while ((written = write_buffer(buf, len)) < len && !message) {
@@ -412,28 +490,41 @@ static int __init flush_buffer(void *bufv, unsigned len)
len -= written;
state = Reset;
} else
- error("junk in compressed archive");
+ error("junk within compressed archive");
}
return origLen;
}
-static unsigned my_inptr; /* index of next byte to be processed in inbuf */
+static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */
#include <linux/decompress/generic.h>
-static char * __init unpack_to_rootfs(char *buf, unsigned len)
+/**
+ * unpack_to_rootfs - decompress and extract an initramfs archive
+ * @buf: input initramfs archive to extract
+ * @len: length of initramfs data to process
+ *
+ * Returns: NULL for success or an error message string
+ *
+ * This symbol shouldn't be used externally. It's available for unit tests.
+ */
+char * __init unpack_to_rootfs(char *buf, unsigned long len)
{
- int written, res;
+ long written;
decompress_fn decompress;
const char *compress_name;
- static __initdata char msg_buf[64];
+ struct {
+ char header[CPIO_HDRLEN];
+ char symlink[PATH_MAX + N_ALIGN(PATH_MAX) + 1];
+ char name[N_ALIGN(PATH_MAX)];
+ } *bufs = kmalloc(sizeof(*bufs), GFP_KERNEL);
- header_buf = kmalloc(110, GFP_KERNEL);
- symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL);
- name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL);
+ if (!bufs)
+ panic_show_mem("can't allocate buffers");
- if (!header_buf || !symlink_buf || !name_buf)
- panic("can't allocate buffers");
+ header_buf = bufs->header;
+ symlink_buf = bufs->symlink;
+ name_buf = bufs->name;
state = Start;
this_header = 0;
@@ -455,30 +546,28 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len)
}
this_header = 0;
decompress = decompress_method(buf, len, &compress_name);
+ pr_debug("Detected %s compressed data\n", compress_name);
if (decompress) {
- res = decompress(buf, len, NULL, flush_buffer, NULL,
+ int res = decompress(buf, len, NULL, flush_buffer, NULL,
&my_inptr, error);
if (res)
error("decompressor failed");
} else if (compress_name) {
- if (!message) {
- snprintf(msg_buf, sizeof msg_buf,
- "compression method %s not configured",
- compress_name);
- message = msg_buf;
- }
+ pr_err("compression method %s not configured\n",
+ compress_name);
+ error("decompressor failed");
} else
- error("junk in compressed archive");
+ error("invalid magic at start of compressed archive");
if (state != Reset)
- error("junk in compressed archive");
+ error("junk at the end of compressed archive");
this_header = saved_offset + my_inptr;
buf += my_inptr;
len -= my_inptr;
}
dir_utime();
- kfree(name_buf);
- kfree(symlink_buf);
- kfree(header_buf);
+ /* free hardlink state collected if the optional TRAILER!!! record was absent */
+ free_hash();
+ kfree(bufs);
return message;
}
@@ -493,135 +582,210 @@ static int __init retain_initrd_param(char *str)
}
__setup("retain_initrd", retain_initrd_param);
+#ifdef CONFIG_ARCH_HAS_KEEPINITRD
+static int __init keepinitrd_setup(char *__unused)
+{
+ do_retain_initrd = 1;
+ return 1;
+}
+__setup("keepinitrd", keepinitrd_setup);
+#endif
+
+static bool __initdata initramfs_async = true;
+static int __init initramfs_async_setup(char *str)
+{
+ return kstrtobool(str, &initramfs_async) == 0;
+}
+__setup("initramfs_async=", initramfs_async_setup);
+
extern char __initramfs_start[];
extern unsigned long __initramfs_size;
#include <linux/initrd.h>
#include <linux/kexec.h>
-static void __init free_initrd(void)
+static BIN_ATTR(initrd, 0440, sysfs_bin_attr_simple_read, NULL, 0);
+
+void __init reserve_initrd_mem(void)
+{
+ phys_addr_t start;
+ unsigned long size;
+
+ /* Ignore the virtual address computed during device tree parsing */
+ initrd_start = initrd_end = 0;
+
+ if (!phys_initrd_size)
+ return;
+ /*
+ * Round the memory region to page boundaries as per free_initrd_mem()
+ * This allows us to detect whether the pages overlapping the initrd
+ * are in use, but more importantly, reserves the entire set of pages
+ * as we don't want these pages allocated for other purposes.
+ */
+ start = round_down(phys_initrd_start, PAGE_SIZE);
+ size = phys_initrd_size + (phys_initrd_start - start);
+ size = round_up(size, PAGE_SIZE);
+
+ if (!memblock_is_region_memory(start, size)) {
+ pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region",
+ (u64)start, size);
+ goto disable;
+ }
+
+ if (memblock_is_region_reserved(start, size)) {
+ pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n",
+ (u64)start, size);
+ goto disable;
+ }
+
+ memblock_reserve(start, size);
+ /* Now convert initrd to virtual addresses */
+ initrd_start = (unsigned long)__va(phys_initrd_start);
+ initrd_end = initrd_start + phys_initrd_size;
+ initrd_below_start_ok = 1;
+
+ return;
+disable:
+ pr_cont(" - disabling initrd\n");
+ initrd_start = 0;
+ initrd_end = 0;
+}
+
+void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
+ unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE);
+ unsigned long aligned_end = ALIGN(end, PAGE_SIZE);
+
+ memblock_free((void *)aligned_start, aligned_end - aligned_start);
+#endif
+
+ free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+ "initrd");
+}
+
+#ifdef CONFIG_CRASH_RESERVE
+static bool __init kexec_free_initrd(void)
{
-#ifdef CONFIG_KEXEC
unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
-#endif
- if (do_retain_initrd)
- goto skip;
-#ifdef CONFIG_KEXEC
/*
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
*/
- if (initrd_start < crashk_end && initrd_end > crashk_start) {
- /*
- * Initialize initrd memory region since the kexec boot does
- * not do.
- */
- memset((void *)initrd_start, 0, initrd_end - initrd_start);
- if (initrd_start < crashk_start)
- free_initrd_mem(initrd_start, crashk_start);
- if (initrd_end > crashk_end)
- free_initrd_mem(crashk_end, initrd_end);
- } else
-#endif
- free_initrd_mem(initrd_start, initrd_end);
-skip:
- initrd_start = 0;
- initrd_end = 0;
+ if (initrd_start >= crashk_end || initrd_end <= crashk_start)
+ return false;
+
+ /*
+ * Initialize the initrd memory region, since a kexec boot does not do so.
+ */
+ memset((void *)initrd_start, 0, initrd_end - initrd_start);
+ if (initrd_start < crashk_start)
+ free_initrd_mem(initrd_start, crashk_start);
+ if (initrd_end > crashk_end)
+ free_initrd_mem(crashk_end, initrd_end);
+ return true;
+}
+#else
+static inline bool kexec_free_initrd(void)
+{
+ return false;
}
+#endif /* CONFIG_CRASH_RESERVE */
#ifdef CONFIG_BLK_DEV_RAM
-#define BUF_SIZE 1024
-static void __init clean_rootfs(void)
+static void __init populate_initrd_image(char *err)
{
- int fd;
- void *buf;
- struct linux_dirent64 *dirp;
- int num;
-
- fd = sys_open("/", O_RDONLY, 0);
- WARN_ON(fd < 0);
- if (fd < 0)
- return;
- buf = kzalloc(BUF_SIZE, GFP_KERNEL);
- WARN_ON(!buf);
- if (!buf) {
- sys_close(fd);
+ ssize_t written;
+ struct file *file;
+ loff_t pos = 0;
+
+ printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n",
+ err);
+ file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700);
+ if (IS_ERR(file))
return;
- }
-
- dirp = buf;
- num = sys_getdents64(fd, dirp, BUF_SIZE);
- while (num > 0) {
- while (num > 0) {
- struct stat st;
- int ret;
-
- ret = sys_newlstat(dirp->d_name, &st);
- WARN_ON_ONCE(ret);
- if (!ret) {
- if (S_ISDIR(st.st_mode))
- sys_rmdir(dirp->d_name);
- else
- sys_unlink(dirp->d_name);
- }
-
- num -= dirp->d_reclen;
- dirp = (void *)dirp + dirp->d_reclen;
- }
- dirp = buf;
- memset(buf, 0, BUF_SIZE);
- num = sys_getdents64(fd, dirp, BUF_SIZE);
- }
- sys_close(fd);
- kfree(buf);
+ written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start,
+ &pos);
+ if (written != initrd_end - initrd_start)
+ pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
+ written, initrd_end - initrd_start);
+ fput(file);
}
-#endif
+#endif /* CONFIG_BLK_DEV_RAM */
-static int __init populate_rootfs(void)
+static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
{
+ /* Load the built in initramfs */
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
- panic(err); /* Failed to decompress INTERNAL initramfs */
- if (initrd_start) {
-#ifdef CONFIG_BLK_DEV_RAM
- int fd;
+ panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */
+
+ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
+ goto done;
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (!err) {
- free_initrd();
- goto done;
- } else {
- clean_rootfs();
- unpack_to_rootfs(__initramfs_start, __initramfs_size);
- }
- printk(KERN_INFO "rootfs image is not initramfs (%s)"
- "; looks like an initrd\n", err);
- fd = sys_open("/initrd.image",
- O_WRONLY|O_CREAT, 0700);
- if (fd >= 0) {
- sys_write(fd, (char *)initrd_start,
- initrd_end - initrd_start);
- sys_close(fd);
- free_initrd();
- }
- done:
-#else
+ else
printk(KERN_INFO "Unpacking initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (err)
- printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
- free_initrd();
+
+ err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
+ if (err) {
+#ifdef CONFIG_BLK_DEV_RAM
+ populate_initrd_image(err);
+#else
+ printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
#endif
+ }
+
+done:
+ security_initramfs_populated();
+
+ /*
+ * If the initrd region is overlapped with crashkernel reserved region,
+ * free only memory that is not part of crashkernel region.
+ */
+ if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) {
+ free_initrd_mem(initrd_start, initrd_end);
+ } else if (do_retain_initrd && initrd_start) {
+ bin_attr_initrd.size = initrd_end - initrd_start;
+ bin_attr_initrd.private = (void *)initrd_start;
+ if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd))
+ pr_err("Failed to create initrd sysfs file");
+ }
+ initrd_start = 0;
+ initrd_end = 0;
+
+ init_flush_fput();
+}
+
+static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain);
+static async_cookie_t initramfs_cookie;
+
+void wait_for_initramfs(void)
+{
+ if (!initramfs_cookie) {
/*
- * Try loading default modules from initramfs. This gives
- * us a chance to load before device_initcalls.
+ * Something before rootfs_initcall wants to access
+ * the filesystem/initramfs. Probably a bug. Make a
+ * note, avoid deadlocking the machine, and let the
+ * caller's access fail as it used to.
*/
- load_default_modules();
+ pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n");
+ return;
}
+ async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain);
+}
+EXPORT_SYMBOL_GPL(wait_for_initramfs);
+
+static int __init populate_rootfs(void)
+{
+ initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL,
+ &initramfs_domain);
+ usermodehelper_enable();
+ if (!initramfs_async)
+ wait_for_initramfs();
return 0;
}
rootfs_initcall(populate_rootfs);
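[Editorial aside, not part of the patch] populate_rootfs() now only schedules do_populate_rootfs() on the async domain, so early code that needs files out of the initramfs must serialize against it through the exported wait_for_initramfs(). A minimal sketch of such a consumer follows; the file path, function name and initcall level are invented for illustration, and wait_for_initramfs() is assumed to be declared in <linux/initrd.h>.

#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/initrd.h>

static int __init example_rootfs_consumer_init(void)
{
	struct file *f;

	/* Block until the asynchronous do_populate_rootfs() has completed. */
	wait_for_initramfs();

	f = filp_open("/etc/example.conf", O_RDONLY, 0);	/* hypothetical path */
	if (IS_ERR(f))
		return 0;	/* the file may simply not be packed into the archive */
	/* ... consume the file ... */
	fput(f);
	return 0;
}
late_initcall(example_rootfs_consumer_init);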
diff --git a/init/initramfs_internal.h b/init/initramfs_internal.h
new file mode 100644
index 000000000000..233dad16b0a0
--- /dev/null
+++ b/init/initramfs_internal.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __INITRAMFS_INTERNAL_H__
+#define __INITRAMFS_INTERNAL_H__
+
+char *unpack_to_rootfs(char *buf, unsigned long len);
+#define CPIO_HDRLEN 110
+
+#endif
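[Editorial aside, not part of the header] The 110 in CPIO_HDRLEN is the fixed size of a newc/crc cpio header: the 6-character ASCII magic followed by thirteen 8-character hex fields, exactly the fields fill_cpio() in the test below prints with "%08x". A compile-time sketch of that arithmetic, with macro names invented for illustration:

#include <linux/build_bug.h>
#include "initramfs_internal.h"

#define CPIO_NEWC_MAGIC_LEN	6	/* "070701" or "070702" */
#define CPIO_NEWC_FIELDS	13	/* ino, mode, uid, gid, nlink, mtime, filesize,
					 * devmajor, devminor, rdevmajor, rdevminor,
					 * namesize, csum
					 */
#define CPIO_NEWC_FIELD_LEN	8	/* zero-padded ASCII hex, "%08x" */

static_assert(CPIO_NEWC_MAGIC_LEN + CPIO_NEWC_FIELDS * CPIO_NEWC_FIELD_LEN == CPIO_HDRLEN,
	      "a newc/crc cpio header is 110 bytes");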
diff --git a/init/initramfs_test.c b/init/initramfs_test.c
new file mode 100644
index 000000000000..5d2db455e60c
--- /dev/null
+++ b/init/initramfs_test.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init_syscalls.h>
+#include <linux/stringify.h>
+#include <linux/timekeeping.h>
+#include "initramfs_internal.h"
+
+struct initramfs_test_cpio {
+ char *magic;
+ unsigned int ino;
+ unsigned int mode;
+ unsigned int uid;
+ unsigned int gid;
+ unsigned int nlink;
+ unsigned int mtime;
+ unsigned int filesize;
+ unsigned int devmajor;
+ unsigned int devminor;
+ unsigned int rdevmajor;
+ unsigned int rdevminor;
+ unsigned int namesize;
+ unsigned int csum;
+ char *fname;
+ char *data;
+};
+
+static size_t fill_cpio(struct initramfs_test_cpio *cs, size_t csz, char *out)
+{
+ int i;
+ size_t off = 0;
+
+ for (i = 0; i < csz; i++) {
+ char *pos = &out[off];
+ struct initramfs_test_cpio *c = &cs[i];
+ size_t thislen;
+
+ /* +1 to account for nulterm */
+ thislen = sprintf(pos, "%s"
+ "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x"
+ "%s",
+ c->magic, c->ino, c->mode, c->uid, c->gid, c->nlink,
+ c->mtime, c->filesize, c->devmajor, c->devminor,
+ c->rdevmajor, c->rdevminor, c->namesize, c->csum,
+ c->fname) + 1;
+
+ pr_debug("packing (%zu): %.*s\n", thislen, (int)thislen, pos);
+ if (thislen != CPIO_HDRLEN + c->namesize)
+ pr_debug("padded to: %u\n", CPIO_HDRLEN + c->namesize);
+ off += CPIO_HDRLEN + c->namesize;
+ while (off & 3)
+ out[off++] = '\0';
+
+ memcpy(&out[off], c->data, c->filesize);
+ off += c->filesize;
+ while (off & 3)
+ out[off++] = '\0';
+ }
+
+ return off;
+}
+
+static void __init initramfs_test_extract(struct kunit *test)
+{
+ char *err, *cpio_srcbuf;
+ size_t len;
+ struct timespec64 ts_before, ts_after;
+ struct kstat st = {};
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .uid = 12,
+ .gid = 34,
+ .nlink = 1,
+ .mtime = 56,
+ .filesize = 0,
+ .devmajor = 0,
+ .devminor = 1,
+ .rdevmajor = 0,
+ .rdevminor = 0,
+ .namesize = sizeof("initramfs_test_extract"),
+ .csum = 0,
+ .fname = "initramfs_test_extract",
+ }, {
+ .magic = "070701",
+ .ino = 2,
+ .mode = S_IFDIR | 0777,
+ .nlink = 1,
+ .mtime = 57,
+ .devminor = 1,
+ .namesize = sizeof("initramfs_test_extract_dir"),
+ .fname = "initramfs_test_extract_dir",
+ }, {
+ .magic = "070701",
+ .namesize = sizeof("TRAILER!!!"),
+ .fname = "TRAILER!!!",
+ } };
+
+ /* +3 to cater for any 4-byte end-alignment */
+ cpio_srcbuf = kzalloc(ARRAY_SIZE(c) * (CPIO_HDRLEN + PATH_MAX + 3),
+ GFP_KERNEL);
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+
+ ktime_get_real_ts64(&ts_before);
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ ktime_get_real_ts64(&ts_after);
+ if (err) {
+ KUNIT_FAIL(test, "unpack failed %s", err);
+ goto out;
+ }
+
+ KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st, 0), 0);
+ KUNIT_EXPECT_TRUE(test, S_ISREG(st.mode));
+ KUNIT_EXPECT_TRUE(test, uid_eq(st.uid, KUIDT_INIT(c[0].uid)));
+ KUNIT_EXPECT_TRUE(test, gid_eq(st.gid, KGIDT_INIT(c[0].gid)));
+ KUNIT_EXPECT_EQ(test, st.nlink, 1);
+ if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) {
+ KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[0].mtime);
+ } else {
+ KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec);
+ KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec);
+ }
+ KUNIT_EXPECT_EQ(test, st.blocks, c[0].filesize);
+
+ KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st, 0), 0);
+ KUNIT_EXPECT_TRUE(test, S_ISDIR(st.mode));
+ if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) {
+ KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[1].mtime);
+ } else {
+ KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec);
+ KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec);
+ }
+
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+ KUNIT_EXPECT_EQ(test, init_rmdir(c[1].fname), 0);
+out:
+ kfree(cpio_srcbuf);
+}
+
+/*
+ * Don't terminate filename. Previously, the cpio filename field was passed
+ * directly to filp_open(collected, O_CREAT|..) without nulterm checks. See
+ * https://lore.kernel.org/linux-fsdevel/20241030035509.20194-2-ddiss@suse.de
+ */
+static void __init initramfs_test_fname_overrun(struct kunit *test)
+{
+ char *err, *cpio_srcbuf;
+ size_t len, suffix_off;
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .uid = 0,
+ .gid = 0,
+ .nlink = 1,
+ .mtime = 1,
+ .filesize = 0,
+ .devmajor = 0,
+ .devminor = 1,
+ .rdevmajor = 0,
+ .rdevminor = 0,
+ .namesize = sizeof("initramfs_test_fname_overrun"),
+ .csum = 0,
+ .fname = "initramfs_test_fname_overrun",
+ } };
+
+ /*
+ * poison cpio source buffer, so we can detect overrun. source
+ * buffer is used by read_into() when hdr or fname
+ * are already available (e.g. no compression).
+ */
+ cpio_srcbuf = kmalloc(CPIO_HDRLEN + PATH_MAX + 3, GFP_KERNEL);
+ memset(cpio_srcbuf, 'B', CPIO_HDRLEN + PATH_MAX + 3);
+ /* limit overrun to avoid crashes / filp_open() ENAMETOOLONG */
+ cpio_srcbuf[CPIO_HDRLEN + strlen(c[0].fname) + 20] = '\0';
+
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+ /* overwrite trailing fname terminator and padding */
+ suffix_off = len - 1;
+ while (cpio_srcbuf[suffix_off] == '\0') {
+ cpio_srcbuf[suffix_off] = 'P';
+ suffix_off--;
+ }
+
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NOT_NULL(test, err);
+
+ kfree(cpio_srcbuf);
+}
+
+static void __init initramfs_test_data(struct kunit *test)
+{
+ char *err, *cpio_srcbuf;
+ size_t len;
+ struct file *file;
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .uid = 0,
+ .gid = 0,
+ .nlink = 1,
+ .mtime = 1,
+ .filesize = sizeof("ASDF") - 1,
+ .devmajor = 0,
+ .devminor = 1,
+ .rdevmajor = 0,
+ .rdevminor = 0,
+ .namesize = sizeof("initramfs_test_data"),
+ .csum = 0,
+ .fname = "initramfs_test_data",
+ .data = "ASDF",
+ } };
+
+ /* +6 for max name and data 4-byte padding */
+ cpio_srcbuf = kmalloc(CPIO_HDRLEN + c[0].namesize + c[0].filesize + 6,
+ GFP_KERNEL);
+
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ file = filp_open(c[0].fname, O_RDONLY, 0);
+ if (IS_ERR(file)) {
+ KUNIT_FAIL(test, "open failed");
+ goto out;
+ }
+
+ /* read back file contents into @cpio_srcbuf and confirm match */
+ len = kernel_read(file, cpio_srcbuf, c[0].filesize, NULL);
+ KUNIT_EXPECT_EQ(test, len, c[0].filesize);
+ KUNIT_EXPECT_MEMEQ(test, cpio_srcbuf, c[0].data, len);
+
+ fput(file);
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+out:
+ kfree(cpio_srcbuf);
+}
+
+static void __init initramfs_test_csum(struct kunit *test)
+{
+ char *err, *cpio_srcbuf;
+ size_t len;
+ struct initramfs_test_cpio c[] = { {
+ /* 070702 magic indicates a valid csum is present */
+ .magic = "070702",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .nlink = 1,
+ .filesize = sizeof("ASDF") - 1,
+ .devminor = 1,
+ .namesize = sizeof("initramfs_test_csum"),
+ .csum = 'A' + 'S' + 'D' + 'F',
+ .fname = "initramfs_test_csum",
+ .data = "ASDF",
+ }, {
+ /* mix csum entry above with no-csum entry below */
+ .magic = "070701",
+ .ino = 2,
+ .mode = S_IFREG | 0777,
+ .nlink = 1,
+ .filesize = sizeof("ASDF") - 1,
+ .devminor = 1,
+ .namesize = sizeof("initramfs_test_csum_not_here"),
+ /* csum ignored */
+ .csum = 5555,
+ .fname = "initramfs_test_csum_not_here",
+ .data = "ASDF",
+ } };
+
+ cpio_srcbuf = kmalloc(8192, GFP_KERNEL);
+
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+ KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0);
+
+ /* mess up the csum and confirm that unpack fails */
+ c[0].csum--;
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NOT_NULL(test, err);
+
+ /*
+ * file (with content) is still retained in case of bad-csum abort.
+ * Perhaps we should change this.
+ */
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+ KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), -ENOENT);
+ kfree(cpio_srcbuf);
+}
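[Editorial aside] A worked example of the checksum exercised above, assuming only what the test data shows:

/*
 * The 070702 ("crc") variant stores the plain 32-bit sum of the file data
 * bytes in the csum header field. For the "ASDF" payload:
 *
 *	'A' + 'S' + 'D' + 'F' = 0x41 + 0x53 + 0x44 + 0x46 = 0x11e (286)
 *
 * fill_cpio() emits this as "0000011e", and do_copy() compares the io_csum
 * accumulated while writing the body against the parsed hdr_csum, aborting
 * the extraction on a mismatch.
 */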
+
+/*
+ * hardlink hashtable may leak when the archive omits a trailer:
+ * https://lore.kernel.org/r/20241107002044.16477-10-ddiss@suse.de/
+ */
+static void __init initramfs_test_hardlink(struct kunit *test)
+{
+ char *err, *cpio_srcbuf;
+ size_t len;
+ struct kstat st0, st1;
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .nlink = 2,
+ .devminor = 1,
+ .namesize = sizeof("initramfs_test_hardlink"),
+ .fname = "initramfs_test_hardlink",
+ }, {
+ /* hardlink data is present in last archive entry */
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .nlink = 2,
+ .filesize = sizeof("ASDF") - 1,
+ .devminor = 1,
+ .namesize = sizeof("initramfs_test_hardlink_link"),
+ .fname = "initramfs_test_hardlink_link",
+ .data = "ASDF",
+ } };
+
+ cpio_srcbuf = kmalloc(8192, GFP_KERNEL);
+
+ len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf);
+
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st0, 0), 0);
+ KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st1, 0), 0);
+ KUNIT_EXPECT_EQ(test, st0.ino, st1.ino);
+ KUNIT_EXPECT_EQ(test, st0.nlink, 2);
+ KUNIT_EXPECT_EQ(test, st1.nlink, 2);
+
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+ KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0);
+
+ kfree(cpio_srcbuf);
+}
+
+#define INITRAMFS_TEST_MANY_LIMIT 1000
+#define INITRAMFS_TEST_MANY_PATH_MAX (sizeof("initramfs_test_many-") \
+ + sizeof(__stringify(INITRAMFS_TEST_MANY_LIMIT)))
+static void __init initramfs_test_many(struct kunit *test)
+{
+ char *err, *cpio_srcbuf, *p;
+ size_t len = INITRAMFS_TEST_MANY_LIMIT *
+ (CPIO_HDRLEN + INITRAMFS_TEST_MANY_PATH_MAX + 3);
+ char thispath[INITRAMFS_TEST_MANY_PATH_MAX];
+ int i;
+
+ p = cpio_srcbuf = kmalloc(len, GFP_KERNEL);
+
+ for (i = 0; i < INITRAMFS_TEST_MANY_LIMIT; i++) {
+ struct initramfs_test_cpio c = {
+ .magic = "070701",
+ .ino = i,
+ .mode = S_IFREG | 0777,
+ .nlink = 1,
+ .devminor = 1,
+ .fname = thispath,
+ };
+
+ c.namesize = 1 + sprintf(thispath, "initramfs_test_many-%d", i);
+ p += fill_cpio(&c, 1, p);
+ }
+
+ len = p - cpio_srcbuf;
+ err = unpack_to_rootfs(cpio_srcbuf, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ for (i = 0; i < INITRAMFS_TEST_MANY_LIMIT; i++) {
+ sprintf(thispath, "initramfs_test_many-%d", i);
+ KUNIT_EXPECT_EQ(test, init_unlink(thispath), 0);
+ }
+
+ kfree(cpio_srcbuf);
+}
+
+/*
+ * An initramfs filename is namesize in length, including the zero-terminator.
+ * A filename can be zero-terminated prior to namesize, with the remainder used
+ * as padding. This can be useful for e.g. alignment of file data segments with
+ * a 4KB filesystem block, allowing for extent sharing (reflinks) between cpio
+ * source and destination. This hack works with both GNU cpio and initramfs, as
+ * long as PATH_MAX isn't exceeded.
+ */
+static void __init initramfs_test_fname_pad(struct kunit *test)
+{
+ char *err;
+ size_t len;
+ struct file *file;
+ char fdata[] = "this file data is aligned at 4K in the archive";
+ struct test_fname_pad {
+ char padded_fname[4096 - CPIO_HDRLEN];
+ char cpio_srcbuf[CPIO_HDRLEN + PATH_MAX + 3 + sizeof(fdata)];
+ } *tbufs = kzalloc(sizeof(struct test_fname_pad), GFP_KERNEL);
+ struct initramfs_test_cpio c[] = { {
+ .magic = "070701",
+ .ino = 1,
+ .mode = S_IFREG | 0777,
+ .uid = 0,
+ .gid = 0,
+ .nlink = 1,
+ .mtime = 1,
+ .filesize = sizeof(fdata),
+ .devmajor = 0,
+ .devminor = 1,
+ .rdevmajor = 0,
+ .rdevminor = 0,
+ /* align file data at 4K archive offset via padded fname */
+ .namesize = 4096 - CPIO_HDRLEN,
+ .csum = 0,
+ .fname = tbufs->padded_fname,
+ .data = fdata,
+ } };
+
+ memcpy(tbufs->padded_fname, "padded_fname", sizeof("padded_fname"));
+ len = fill_cpio(c, ARRAY_SIZE(c), tbufs->cpio_srcbuf);
+
+ err = unpack_to_rootfs(tbufs->cpio_srcbuf, len);
+ KUNIT_EXPECT_NULL(test, err);
+
+ file = filp_open(c[0].fname, O_RDONLY, 0);
+ if (IS_ERR(file)) {
+ KUNIT_FAIL(test, "open failed");
+ goto out;
+ }
+
+ /* read back file contents into @cpio_srcbuf and confirm match */
+ len = kernel_read(file, tbufs->cpio_srcbuf, c[0].filesize, NULL);
+ KUNIT_EXPECT_EQ(test, len, c[0].filesize);
+ KUNIT_EXPECT_MEMEQ(test, tbufs->cpio_srcbuf, c[0].data, len);
+
+ fput(file);
+ KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0);
+out:
+ kfree(tbufs);
+}
+
+/*
+ * The kunit_case/_suite struct cannot be marked as __initdata as this will be
+ * used in debugfs to retrieve results after test has run.
+ */
+static struct kunit_case __refdata initramfs_test_cases[] = {
+ KUNIT_CASE(initramfs_test_extract),
+ KUNIT_CASE(initramfs_test_fname_overrun),
+ KUNIT_CASE(initramfs_test_data),
+ KUNIT_CASE(initramfs_test_csum),
+ KUNIT_CASE(initramfs_test_hardlink),
+ KUNIT_CASE(initramfs_test_many),
+ KUNIT_CASE(initramfs_test_fname_pad),
+ {},
+};
+
+static struct kunit_suite initramfs_test_suite = {
+ .name = "initramfs",
+ .test_cases = initramfs_test_cases,
+};
+kunit_test_init_section_suites(&initramfs_test_suite);
+
+MODULE_DESCRIPTION("Initramfs KUnit test suite");
+MODULE_LICENSE("GPL v2");
diff --git a/init/main.c b/init/main.c
index d03d2ec2eacf..b84818ad9685 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/init/main.c
*
@@ -6,14 +7,17 @@
* GK 2/5/95 - Changed to support mounting root fs via NFS
* Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
* Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
- * Simplified starting of init: Michael A. Griffith <grif@acm.org>
+ * Simplified starting of init: Michael A. Griffith <grif@acm.org>
*/
#define DEBUG /* Enable initcall_debug */
#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/extable.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
+#include <linux/binfmts.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/stackprotector.h>
@@ -23,83 +27,96 @@
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/initrd.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/acpi.h>
-#include <linux/tty.h>
+#include <linux/bootconfig.h>
+#include <linux/console.h>
+#include <linux/nmi.h>
#include <linux/percpu.h>
#include <linux/kmod.h>
+#include <linux/kprobes.h>
+#include <linux/kmsan.h>
#include <linux/vmalloc.h>
#include <linux/kernel_stat.h>
#include <linux/start_kernel.h>
#include <linux/security.h>
#include <linux/smp.h>
#include <linux/profile.h>
+#include <linux/kfence.h>
#include <linux/rcupdate.h>
+#include <linux/srcu.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
+#include <linux/buildid.h>
#include <linux/writeback.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/memcontrol.h>
#include <linux/cgroup.h>
-#include <linux/efi.h>
#include <linux/tick.h>
+#include <linux/sched/isolation.h>
#include <linux/interrupt.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
+#include <linux/utsname.h>
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
-#include <linux/buffer_head.h>
-#include <linux/page_cgroup.h>
#include <linux/debug_locks.h>
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
#include <linux/kmemleak.h>
+#include <linux/padata.h>
#include <linux/pid_namespace.h>
-#include <linux/device.h>
+#include <linux/device/driver.h>
#include <linux/kthread.h>
#include <linux/sched.h>
+#include <linux/sched/init.h>
#include <linux/signal.h>
#include <linux/idr.h>
#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/async.h>
-#include <linux/kmemcheck.h>
-#include <linux/sfi.h>
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
-#include <linux/file.h>
#include <linux/ptrace.h>
+#include <linux/pti.h>
#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/sched_clock.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/context_tracking.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
+#include <linux/list.h>
+#include <linux/integrity.h>
+#include <linux/proc_ns.h>
+#include <linux/io.h>
+#include <linux/cache.h>
+#include <linux/rodata_test.h>
+#include <linux/jump_label.h>
+#include <linux/kcsan.h>
+#include <linux/init_syscalls.h>
+#include <linux/stackdepot.h>
+#include <linux/randomize_kstack.h>
+#include <linux/pidfs.h>
+#include <linux/ptdump.h>
+#include <linux/time_namespace.h>
+#include <net/net_namespace.h>
#include <asm/io.h>
-#include <asm/bugs.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/smp.h>
-#endif
-
-static int kernel_init(void *);
+#define CREATE_TRACE_POINTS
+#include <trace/events/initcall.h>
-extern void init_IRQ(void);
-extern void fork_init(unsigned long);
-extern void mca_init(void);
-extern void sbus_init(void);
-extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
+#include <kunit/test.h>
-#ifdef CONFIG_TC
-extern void tc_init(void);
-#endif
+static int kernel_init(void *);
/*
* Debug helper: via this flag we know that we are in 'early bootup code'
@@ -119,20 +136,39 @@ EXPORT_SYMBOL(system_state);
#define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
#define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
-extern void time_init(void);
/* Default late time init is NULL. archs can override this later. */
void (*__initdata late_time_init)(void);
-extern void softirq_init(void);
/* Untouched command line saved by arch-specific code. */
char __initdata boot_command_line[COMMAND_LINE_SIZE];
/* Untouched saved command line (eg. for /proc) */
-char *saved_command_line;
+char *saved_command_line __ro_after_init;
+unsigned int saved_command_line_len __ro_after_init;
/* Command line for parameter parsing */
static char *static_command_line;
+/* Untouched extra command line */
+static char *extra_command_line;
+/* Extra init arguments */
+static char *extra_init_args;
+
+#ifdef CONFIG_BOOT_CONFIG
+/* Is bootconfig on command line? */
+static bool bootconfig_found;
+static size_t initargs_offs;
+#else
+# define bootconfig_found false
+# define initargs_offs 0
+#endif
static char *execute_command;
-static char *ramdisk_execute_command;
+static char *ramdisk_execute_command = "/init";
+
+/*
+ * Used to generate warnings if static_key manipulation functions are used
+ * before jump_label_init is called.
+ */
+bool static_key_initialized __read_mostly;
+EXPORT_SYMBOL_GPL(static_key_initialized);
/*
* If set, this is an indication to the drivers that reset the underlying
@@ -140,7 +176,7 @@ static char *ramdisk_execute_command;
* rely on the BIOS and skip the reset operation.
*
* This is useful if kernel is booting in an unreliable environment.
- * For ex. kdump situaiton where previous kernel has crashed, BIOS has been
+ * For ex. kdump situation where previous kernel has crashed, BIOS has been
* skipped and devices will be in unknown state.
*/
unsigned int reset_devices;
@@ -154,16 +190,14 @@ static int __init set_reset_devices(char *str)
__setup("reset_devices", set_reset_devices);
-static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
-const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
+static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
+const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
static const char *panic_later, *panic_param;
-extern const struct obs_kernel_param __setup_start[], __setup_end[];
-
-static int __init obsolete_checksetup(char *line)
+static bool __init obsolete_checksetup(char *line)
{
const struct obs_kernel_param *p;
- int had_early_param = 0;
+ bool had_early_param = false;
p = __setup_start;
do {
@@ -175,13 +209,13 @@ static int __init obsolete_checksetup(char *line)
* Keep iterating, as we can have early
* params and __setups of same names 8( */
if (line[n] == '\0' || line[n] == '=')
- had_early_param = 1;
+ had_early_param = true;
} else if (!p->setup_func) {
pr_warn("Parameter %s is obsolete, ignored\n",
p->str);
- return 1;
+ return true;
} else if (p->setup_func(line + n))
- return 1;
+ return true;
}
p++;
} while (p < __setup_end);
@@ -194,18 +228,17 @@ static int __init obsolete_checksetup(char *line)
* still work even if initially too large, it will just take slightly longer
*/
unsigned long loops_per_jiffy = (1<<12);
-
EXPORT_SYMBOL(loops_per_jiffy);
static int __init debug_kernel(char *str)
{
- console_loglevel = 10;
+ console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
return 0;
}
static int __init quiet_kernel(char *str)
{
- console_loglevel = 4;
+ console_loglevel = CONSOLE_LOGLEVEL_QUIET;
return 0;
}
@@ -231,8 +264,244 @@ static int __init loglevel(char *str)
early_param("loglevel", loglevel);
+#ifdef CONFIG_BLK_DEV_INITRD
+static void * __init get_boot_config_from_initrd(size_t *_size)
+{
+ u32 size, csum;
+ char *data;
+ u32 *hdr;
+ int i;
+
+ if (!initrd_end)
+ return NULL;
+
+ data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN;
+ /*
+ * Since Grub may align the size of initrd to 4, we must
+ * check the preceding 3 bytes as well.
+ */
+ for (i = 0; i < 4; i++) {
+ if (!memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN))
+ goto found;
+ data--;
+ }
+ return NULL;
+
+found:
+ hdr = (u32 *)(data - 8);
+ size = le32_to_cpu(hdr[0]);
+ csum = le32_to_cpu(hdr[1]);
+
+ data = ((void *)hdr) - size;
+ if ((unsigned long)data < initrd_start) {
+ pr_err("bootconfig size %d is greater than initrd size %ld\n",
+ size, initrd_end - initrd_start);
+ return NULL;
+ }
+
+ if (xbc_calc_checksum(data, size) != csum) {
+ pr_err("bootconfig checksum failed\n");
+ return NULL;
+ }
+
+ /* Remove bootconfig from initramfs/initrd */
+ initrd_end = (unsigned long)data;
+ if (_size)
+ *_size = size;
+
+ return data;
+}
+#else
+static void * __init get_boot_config_from_initrd(size_t *_size)
+{
+ return NULL;
+}
+#endif
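[Editorial aside, not part of the patch] The pointer walk in get_boot_config_from_initrd() above implies the following layout at the tail of the initrd image; the footer field order follows directly from the hdr[0]/hdr[1] reads, while BOOTCONFIG_MAGIC itself is defined in the bootconfig headers and only assumed here:

/*
 *	... initrd payload ...
 *	[ bootconfig data, "size" bytes                 ]
 *	[ size:  u32, little-endian                     ]  <- hdr[0]
 *	[ csum:  u32, little-endian                     ]  <- hdr[1]
 *	[ BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN bytes  ]
 *	[ 0-3 bytes of bootloader alignment padding     ]  <- initrd_end
 */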
+
+#ifdef CONFIG_BOOT_CONFIG
+
+static char xbc_namebuf[XBC_KEYLEN_MAX] __initdata;
+
+#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0)
+
+static int __init xbc_snprint_cmdline(char *buf, size_t size,
+ struct xbc_node *root)
+{
+ struct xbc_node *knode, *vnode;
+ char *end = buf + size;
+ const char *val, *q;
+ int ret;
+
+ xbc_node_for_each_key_value(root, knode, val) {
+ ret = xbc_node_compose_key_after(root, knode,
+ xbc_namebuf, XBC_KEYLEN_MAX);
+ if (ret < 0)
+ return ret;
+
+ vnode = xbc_node_get_child(knode);
+ if (!vnode) {
+ ret = snprintf(buf, rest(buf, end), "%s ", xbc_namebuf);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ continue;
+ }
+ xbc_array_for_each_value(vnode, val) {
+ /*
+ * For prettier and more readable /proc/cmdline, only
+ * quote the value when necessary, i.e. when it contains
+ * whitespace.
+ */
+ q = strpbrk(val, " \t\r\n") ? "\"" : "";
+ ret = snprintf(buf, rest(buf, end), "%s=%s%s%s ",
+ xbc_namebuf, q, val, q);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ }
+ }
+
+ return buf - (end - size);
+}
+#undef rest
+
+/* Make an extra command line under given key word */
+static char * __init xbc_make_cmdline(const char *key)
+{
+ struct xbc_node *root;
+ char *new_cmdline;
+ int ret, len = 0;
+
+ root = xbc_find_node(key);
+ if (!root)
+ return NULL;
+
+ /* Count required buffer size */
+ len = xbc_snprint_cmdline(NULL, 0, root);
+ if (len <= 0)
+ return NULL;
+
+ new_cmdline = memblock_alloc(len + 1, SMP_CACHE_BYTES);
+ if (!new_cmdline) {
+ pr_err("Failed to allocate memory for extra kernel cmdline.\n");
+ return NULL;
+ }
+
+ ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
+ if (ret < 0 || ret > len) {
+ pr_err("Failed to print extra kernel cmdline.\n");
+ memblock_free(new_cmdline, len + 1);
+ return NULL;
+ }
+
+ return new_cmdline;
+}
+
+static int __init bootconfig_params(char *param, char *val,
+ const char *unused, void *arg)
+{
+ if (strcmp(param, "bootconfig") == 0) {
+ bootconfig_found = true;
+ }
+ return 0;
+}
+
+static int __init warn_bootconfig(char *str)
+{
+ /* The 'bootconfig' has been handled by bootconfig_params(). */
+ return 0;
+}
+
+static void __init setup_boot_config(void)
+{
+ static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
+ const char *msg, *data;
+ int pos, ret;
+ size_t size;
+ char *err;
+
+ /* Cut out the bootconfig data even if we have no bootconfig option */
+ data = get_boot_config_from_initrd(&size);
+ /* If there is no bootconfig in initrd, try embedded one. */
+ if (!data)
+ data = xbc_get_embedded_bootconfig(&size);
+
+ strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
+ bootconfig_params);
+
+ if (IS_ERR(err) || !(bootconfig_found || IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE)))
+ return;
+
+ /* parse_args() stops at the param following '--' and returns its address */
+ if (err)
+ initargs_offs = err - tmp_cmdline;
+
+ if (!data) {
+ /* If user intended to use bootconfig, show an error level message */
+ if (bootconfig_found)
+ pr_err("'bootconfig' found on command line, but no bootconfig found\n");
+ else
+ pr_info("No bootconfig data provided, so skipping bootconfig");
+ return;
+ }
+
+ if (size >= XBC_DATA_MAX) {
+ pr_err("bootconfig size %ld greater than max size %d\n",
+ (long)size, XBC_DATA_MAX);
+ return;
+ }
+
+ ret = xbc_init(data, size, &msg, &pos);
+ if (ret < 0) {
+ if (pos < 0)
+ pr_err("Failed to init bootconfig: %s.\n", msg);
+ else
+ pr_err("Failed to parse bootconfig: %s at %d.\n",
+ msg, pos);
+ } else {
+ xbc_get_info(&ret, NULL);
+ pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
+ /* keys starting with "kernel." are passed via cmdline */
+ extra_command_line = xbc_make_cmdline("kernel");
+ /* Also, "init." keys are init arguments */
+ extra_init_args = xbc_make_cmdline("init");
+ }
+ return;
+}
+
+static void __init exit_boot_config(void)
+{
+ xbc_exit();
+}
+
+#else /* !CONFIG_BOOT_CONFIG */
+
+static void __init setup_boot_config(void)
+{
+ /* Remove bootconfig data from initrd */
+ get_boot_config_from_initrd(NULL);
+}
+
+static int __init warn_bootconfig(char *str)
+{
+ pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n");
+ return 0;
+}
+
+#define exit_boot_config() do {} while (0)
+
+#endif /* CONFIG_BOOT_CONFIG */
+
+early_param("bootconfig", warn_bootconfig);
+
+bool __init cmdline_has_extra_options(void)
+{
+ return extra_command_line || extra_init_args;
+}
+
/* Change NUL term back to "=", to make "param" the whole string. */
-static int __init repair_env_string(char *param, char *val, const char *unused)
+static void __init repair_env_string(char *param, char *val)
{
if (val) {
/* param=val or param="val"? */
@@ -241,10 +510,30 @@ static int __init repair_env_string(char *param, char *val, const char *unused)
else if (val == param+strlen(param)+2) {
val[-2] = '=';
memmove(val-1, val, strlen(val)+1);
- val--;
} else
BUG();
}
+}
+
+/* Anything after -- gets handed straight to init. */
+static int __init set_init_arg(char *param, char *val,
+ const char *unused, void *arg)
+{
+ unsigned int i;
+
+ if (panic_later)
+ return 0;
+
+ repair_env_string(param, val);
+
+ for (i = 0; argv_init[i]; i++) {
+ if (i == MAX_INIT_ARGS) {
+ panic_later = "init";
+ panic_param = param;
+ return 0;
+ }
+ }
+ argv_init[i] = param;
return 0;
}
@@ -252,16 +541,35 @@ static int __init repair_env_string(char *param, char *val, const char *unused)
* Unknown boot options get handed to init, unless they look like
* unused parameters (modprobe will find them in /proc/cmdline).
*/
-static int __init unknown_bootoption(char *param, char *val, const char *unused)
+static int __init unknown_bootoption(char *param, char *val,
+ const char *unused, void *arg)
{
- repair_env_string(param, val, unused);
+ size_t len = strlen(param);
+ /*
+ * Well-known bootloader identifiers:
+ * 1. LILO/Grub pass "BOOT_IMAGE=...";
+ * 2. kexec/kdump (kexec-tools) pass "kexec".
+ */
+ const char *bootloader[] = { "BOOT_IMAGE=", "kexec", NULL };
+
+ /* Handle params aliased to sysctls */
+ if (sysctl_is_alias(param))
+ return 0;
+
+ repair_env_string(param, val);
+
+ /* Handle bootloader identifier */
+ for (int i = 0; bootloader[i]; i++) {
+ if (strstarts(param, bootloader[i]))
+ return 0;
+ }
/* Handle obsolete-style parameters */
if (obsolete_checksetup(param))
return 0;
/* Unused module parameter. */
- if (strchr(param, '.') && (!val || strchr(param, '.') < val))
+ if (strnchr(param, len, '.'))
return 0;
if (panic_later)
@@ -272,10 +580,10 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused)
unsigned int i;
for (i = 0; envp_init[i]; i++) {
if (i == MAX_INIT_ENVS) {
- panic_later = "Too many boot env vars at `%s'";
+ panic_later = "env";
panic_param = param;
}
- if (!strncmp(param, envp_init[i], val - param))
+ if (!strncmp(param, envp_init[i], len+1))
break;
}
envp_init[i] = param;
@@ -284,7 +592,7 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused)
unsigned int i;
for (i = 0; argv_init[i]; i++) {
if (i == MAX_INIT_ARGS) {
- panic_later = "Too many boot init vars at `%s'";
+ panic_later = "init";
panic_param = param;
}
}
@@ -323,16 +631,6 @@ static int __init rdinit_setup(char *str)
__setup("rdinit=", rdinit_setup);
#ifndef CONFIG_SMP
-static const unsigned int setup_max_cpus = NR_CPUS;
-#ifdef CONFIG_X86_LOCAL_APIC
-static void __init smp_init(void)
-{
- APIC_init_uniprocessor();
-}
-#else
-#define smp_init() do { } while (0)
-#endif
-
static inline void setup_nr_cpu_ids(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
#endif
@@ -345,10 +643,58 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
*/
static void __init setup_command_line(char *command_line)
{
- saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
- static_command_line = alloc_bootmem(strlen (command_line)+1);
- strcpy (saved_command_line, boot_command_line);
- strcpy (static_command_line, command_line);
+ size_t len, xlen = 0, ilen = 0;
+
+ if (extra_command_line)
+ xlen = strlen(extra_command_line);
+ if (extra_init_args) {
+ extra_init_args = strim(extra_init_args); /* remove trailing space */
+ ilen = strlen(extra_init_args) + 4; /* for " -- " */
+ }
+
+ len = xlen + strlen(boot_command_line) + ilen + 1;
+
+ saved_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES);
+
+ len = xlen + strlen(command_line) + 1;
+
+ static_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES);
+
+ if (xlen) {
+ /*
+ * We have to put extra_command_line before boot command
+ * lines because there could be dashes (separator of init
+ * command line) in the command lines.
+ */
+ strcpy(saved_command_line, extra_command_line);
+ strcpy(static_command_line, extra_command_line);
+ }
+ strcpy(saved_command_line + xlen, boot_command_line);
+ strcpy(static_command_line + xlen, command_line);
+
+ if (ilen) {
+ /*
+ * Append supplemental init boot args to saved_command_line
+ * so that user can check what command line options passed
+ * to init.
+ * The order should always be
+ * " -- "[bootconfig init-param][cmdline init-param]
+ */
+ if (initargs_offs) {
+ len = xlen + initargs_offs;
+ strcpy(saved_command_line + len, extra_init_args);
+ len += ilen - 4; /* strlen(extra_init_args) */
+ strcpy(saved_command_line + len,
+ boot_command_line + initargs_offs - 1);
+ } else {
+ len = strlen(saved_command_line);
+ strcpy(saved_command_line + len, " -- ");
+ len += 4;
+ strcpy(saved_command_line + len, extra_init_args);
+ }
+ }
+
+ saved_command_line_len = strlen(saved_command_line);
}
/*
@@ -362,8 +708,9 @@ static void __init setup_command_line(char *command_line)
static __initdata DECLARE_COMPLETION(kthreadd_done);
-static noinline void __init_refok rest_init(void)
+static noinline void __ref __noreturn rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -372,34 +719,52 @@ static noinline void __init_refok rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
+ pid = user_mode_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration is not properly working
+ * until sched_init_smp() has been run. It will set the allowed
+ * CPUs for init to the non isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ tsk->flags |= PF_NO_SETAFFINITY;
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
- pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+ pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
- init_idle_bootup_task(current);
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
}
/* Check for early params. */
-static int __init do_early_param(char *param, char *val, const char *unused)
+static int __init do_early_param(char *param, char *val,
+ const char *unused, void *arg)
{
const struct obs_kernel_param *p;
for (p = __setup_start; p < __setup_end; p++) {
- if ((p->early && parameq(param, p->str)) ||
- (strcmp(param, "console") == 0 &&
- strcmp(p->str, "earlycon") == 0)
- ) {
+ if (p->early && parameq(param, p->str)) {
if (p->setup_func(val) != 0)
pr_warn("Malformed early option '%s'\n", param);
}
@@ -410,124 +775,301 @@ static int __init do_early_param(char *param, char *val, const char *unused)
void __init parse_early_options(char *cmdline)
{
- parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param);
+ parse_args("early options", cmdline, NULL, 0, 0, 0, NULL,
+ do_early_param);
}
/* Arch code calls this early on, or if not, just before other parsing. */
void __init parse_early_param(void)
{
- static __initdata int done = 0;
- static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];
+ static int done __initdata;
+ static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
if (done)
return;
/* All fall through to do_early_param. */
- strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
parse_early_options(tmp_cmdline);
done = 1;
}
-/*
- * Activate the first processor.
- */
+void __init __weak arch_post_acpi_subsys_init(void) { }
-static void __init boot_cpu_init(void)
+void __init __weak smp_setup_processor_id(void)
{
- int cpu = smp_processor_id();
- /* Mark the boot cpu "present", "online" etc for SMP and UP case */
- set_cpu_online(cpu, true);
- set_cpu_active(cpu, true);
- set_cpu_present(cpu, true);
- set_cpu_possible(cpu, true);
}
-void __init __weak smp_setup_processor_id(void)
+void __init __weak smp_prepare_boot_cpu(void)
{
}
# if THREAD_SIZE >= PAGE_SIZE
-void __init __weak thread_info_cache_init(void)
+void __init __weak thread_stack_cache_init(void)
{
}
#endif
-/*
- * Set up kernel memory allocators
- */
-static void __init mm_init(void)
+void __init __weak poking_init(void) { }
+
+void __init __weak pgtable_cache_init(void) { }
+
+void __init __weak trap_init(void) { }
+
+bool initcall_debug;
+core_param(initcall_debug, initcall_debug, bool, 0644);
+
+#ifdef TRACEPOINTS_ENABLED
+static void __init initcall_debug_enable(void);
+#else
+static inline void initcall_debug_enable(void)
{
- /*
- * page_cgroup requires contiguous pages,
- * bigger than MAX_ORDER unless SPARSEMEM.
- */
- page_cgroup_init_flatmem();
- mem_init();
- kmem_cache_init();
- percpu_init_late();
- pgtable_cache_init();
- vmalloc_init();
}
+#endif
-asmlinkage void __init start_kernel(void)
+#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+ randomize_kstack_offset);
+DEFINE_PER_CPU(u32, kstack_offset);
+
+static int __init early_randomize_kstack_offset(char *buf)
{
- char * command_line;
- extern const struct kernel_param __start___param[], __stop___param[];
+ int ret;
+ bool bool_result;
+
+ ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
+
+ if (bool_result)
+ static_branch_enable(&randomize_kstack_offset);
+ else
+ static_branch_disable(&randomize_kstack_offset);
+ return 0;
+}
+early_param("randomize_kstack_offset", early_randomize_kstack_offset);
+#endif
+
+static void __init print_unknown_bootoptions(void)
+{
+ char *unknown_options;
+ char *end;
+ const char *const *p;
+ size_t len;
+
+ if (panic_later || (!argv_init[1] && !envp_init[2]))
+ return;
/*
- * Need to run as early as possible, to initialize the
- * lockdep hash:
+ * Determine how many options we have to print out, plus a space
+ * before each
*/
- lockdep_init();
+ len = 1; /* null terminator */
+ for (p = &argv_init[1]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+ for (p = &envp_init[2]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+
+ unknown_options = memblock_alloc(len, SMP_CACHE_BYTES);
+ if (!unknown_options) {
+ pr_err("%s: Failed to allocate %zu bytes\n",
+ __func__, len);
+ return;
+ }
+ end = unknown_options;
+
+ for (p = &argv_init[1]; *p; p++)
+ end += sprintf(end, " %s", *p);
+ for (p = &envp_init[2]; *p; p++)
+ end += sprintf(end, " %s", *p);
+
+ /* Start at unknown_options[1] to skip the initial space */
+ pr_notice("Unknown kernel command line parameters \"%s\", will be passed to user space.\n",
+ &unknown_options[1]);
+ memblock_free(unknown_options, len);
+}
+
+static void __init early_numa_node_init(void)
+{
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+#ifndef cpu_to_node
+ int cpu;
+
+ /* The early_cpu_to_node() should be ready here. */
+ for_each_possible_cpu(cpu)
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
+#endif
+#endif
+}
+
+#define KERNEL_CMDLINE_PREFIX "Kernel command line: "
+#define KERNEL_CMDLINE_PREFIX_LEN (sizeof(KERNEL_CMDLINE_PREFIX) - 1)
+#define KERNEL_CMDLINE_CONTINUATION " \\"
+#define KERNEL_CMDLINE_CONTINUATION_LEN (sizeof(KERNEL_CMDLINE_CONTINUATION) - 1)
+
+#define MIN_CMDLINE_LOG_WRAP_IDEAL_LEN (KERNEL_CMDLINE_PREFIX_LEN + \
+ KERNEL_CMDLINE_CONTINUATION_LEN)
+#define CMDLINE_LOG_WRAP_IDEAL_LEN (CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN > \
+ MIN_CMDLINE_LOG_WRAP_IDEAL_LEN ? \
+ CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN : \
+ MIN_CMDLINE_LOG_WRAP_IDEAL_LEN)
+
+#define IDEAL_CMDLINE_LEN (CMDLINE_LOG_WRAP_IDEAL_LEN - KERNEL_CMDLINE_PREFIX_LEN)
+#define IDEAL_CMDLINE_SPLIT_LEN (IDEAL_CMDLINE_LEN - KERNEL_CMDLINE_CONTINUATION_LEN)
+
+/**
+ * print_kernel_cmdline() - Print the kernel cmdline with wrapping.
+ * @cmdline: The cmdline to print.
+ *
+ * Print the kernel command line, trying to wrap based on the Kconfig knob
+ * CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN.
+ *
+ * Wrapping is based on spaces, ignoring quotes. All lines are prefixed
+ * with "Kernel command line: " and lines that are not the last line have
+ * a " \" suffix added to them. The prefix and suffix count towards the
+ * line length for wrapping purposes. The ideal length will be exceeded
+ * if no appropriate place to wrap is found.
+ *
+ * Example output if CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN is 40:
+ * Kernel command line: loglevel=7 \
+ * Kernel command line: init=/sbin/init \
+ * Kernel command line: root=PARTUUID=8c3efc1a-768b-6642-8d0c-89eb782f19f0/PARTNROFF=1 \
+ * Kernel command line: rootwait ro \
+ * Kernel command line: my_quoted_arg="The \
+ * Kernel command line: quick brown fox \
+ * Kernel command line: jumps over the \
+ * Kernel command line: lazy dog."
+ */
+static void __init print_kernel_cmdline(const char *cmdline)
+{
+ size_t len;
+
+ /* Config option of 0 or anything longer than the max disables wrapping */
+ if (CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN == 0 ||
+ IDEAL_CMDLINE_LEN >= COMMAND_LINE_SIZE - 1) {
+ pr_notice("%s%s\n", KERNEL_CMDLINE_PREFIX, cmdline);
+ return;
+ }
+
+ len = strlen(cmdline);
+ while (len > IDEAL_CMDLINE_LEN) {
+ const char *first_space;
+ const char *prev_cutoff;
+ const char *cutoff;
+ int to_print;
+ size_t used;
+
+ /* Find the last ' ' that wouldn't make the line too long */
+ prev_cutoff = NULL;
+ cutoff = cmdline;
+ while (true) {
+ cutoff = strchr(cutoff + 1, ' ');
+ if (!cutoff || cutoff - cmdline > IDEAL_CMDLINE_SPLIT_LEN)
+ break;
+ prev_cutoff = cutoff;
+ }
+ if (prev_cutoff)
+ cutoff = prev_cutoff;
+ else if (!cutoff)
+ break;
+
+ /* Find the beginning and end of the string of spaces */
+ first_space = cutoff;
+ while (first_space > cmdline && first_space[-1] == ' ')
+ first_space--;
+ to_print = first_space - cmdline;
+ while (*cutoff == ' ')
+ cutoff++;
+ used = cutoff - cmdline;
+
+ /* If the whole string is used, break and do the final printout */
+ if (len == used)
+ break;
+
+ if (to_print)
+ pr_notice("%s%.*s%s\n", KERNEL_CMDLINE_PREFIX,
+ to_print, cmdline, KERNEL_CMDLINE_CONTINUATION);
+
+ len -= used;
+ cmdline += used;
+ }
+ if (len)
+ pr_notice("%s%s\n", KERNEL_CMDLINE_PREFIX, cmdline);
+}
+
+asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
+void start_kernel(void)
+{
+ char *command_line;
+ char *after_dashes;
+
+ set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();
-
- /*
- * Set up the the initial canary ASAP:
- */
- boot_init_stack_canary();
+ init_vmlinux_build_id();
cgroup_init_early();
local_irq_disable();
early_boot_irqs_disabled = true;
-/*
- * Interrupts are still disabled. Do necessary setups, then
- * enable them
- */
+ /*
+ * Interrupts are still disabled. Do necessary setups, then
+ * enable them.
+ */
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
- mm_init_owner(&init_mm, &init_task);
- mm_init_cpumask(&init_mm);
+ /* Static keys and static calls are needed by LSMs */
+ jump_label_init();
+ static_call_init();
+ early_security_init();
+ setup_boot_config();
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
+ early_numa_node_init();
+ boot_cpu_hotplug_init();
- build_all_zonelists(NULL, NULL);
- page_alloc_init();
-
- pr_notice("Kernel command line: %s\n", boot_command_line);
+ print_kernel_cmdline(saved_command_line);
+ /* parameters may set static keys */
parse_early_param();
- parse_args("Booting kernel", static_command_line, __start___param,
- __stop___param - __start___param,
- -1, -1, &unknown_bootoption);
-
- jump_label_init();
+ after_dashes = parse_args("Booting kernel",
+ static_command_line, __start___param,
+ __stop___param - __start___param,
+ -1, -1, NULL, &unknown_bootoption);
+ print_unknown_bootoptions();
+ if (!IS_ERR_OR_NULL(after_dashes))
+ parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
+ NULL, set_init_arg);
+ if (extra_init_args)
+ parse_args("Setting extra init args", extra_init_args,
+ NULL, 0, -1, -1, NULL, set_init_arg);
+
+ /* Architectural and non-timekeeping rng init, before allocator init */
+ random_init_early(command_line);
/*
* These use large bootmem allocations and must precede
- * kmem_cache_init()
+	 * initialization of the page allocator
*/
setup_log_buf(0);
- pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
- mm_init();
+ mm_core_init();
+ maple_tree_init();
+ poking_init();
+ ftrace_init();
+
+ /* trace_printk can be enabled here */
+ early_trace_init();
/*
	 * Set up the scheduler prior to starting any interrupts (such as the
@@ -535,31 +1077,59 @@ asmlinkage void __init start_kernel(void)
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
+
+ if (WARN(!irqs_disabled(),
+ "Interrupts were enabled *very* early, fixing it\n"))
+ local_irq_disable();
+ radix_tree_init();
+
/*
- * Disable preemption - early bootup scheduling is extremely
- * fragile until we cpu_idle() for the first time.
+ * Set up housekeeping before setting up workqueues to allow the unbound
+ * workqueue to take non-housekeeping into account.
*/
- preempt_disable();
- if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
- local_irq_disable();
- idr_init_cache();
+ housekeeping_init();
+
+ /*
+ * Allow workqueue creation and work item queueing/cancelling
+ * early. Work item execution depends on kthreads and starts after
+ * workqueue_init().
+ */
+ workqueue_init_early();
+
rcu_init();
- tick_nohz_init();
- radix_tree_init();
+ kvfree_rcu_init();
+
+ /* Trace events are available after this */
+ trace_init();
+
+ if (initcall_debug)
+ initcall_debug_enable();
+
+ context_tracking_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
- init_timers();
+ rcu_init_nohz();
+ timers_init();
+ srcu_init();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
- sched_clock_postinit();
+
+ /* This must be after timekeeping is initialized */
+ random_init();
+
+ /* These make use of the fully initialized rng */
+ kfence_init();
+ boot_init_stack_canary();
+
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
+
early_boot_irqs_disabled = false;
local_irq_enable();
@@ -572,9 +1142,10 @@ asmlinkage void __init start_kernel(void)
*/
console_init();
if (panic_later)
- panic(panic_later, panic_param);
+ panic("Too many boot %s vars at `%s'", panic_later,
+ panic_param);
- lockdep_info();
+ lockdep_init();
/*
* Need to run this when irqs are enabled, because it wants
@@ -592,61 +1163,67 @@ asmlinkage void __init start_kernel(void)
initrd_start = 0;
}
#endif
- page_cgroup_init();
- debug_objects_mem_init();
- kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
+ acpi_early_init();
if (late_time_init)
late_time_init();
sched_clock_init();
calibrate_delay();
- pidmap_init();
+
+ arch_cpu_finalize_init();
+
+ pid_idr_init();
anon_vma_init();
-#ifdef CONFIG_X86
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi_enter_virtual_mode();
-#endif
- thread_info_cache_init();
+ thread_stack_cache_init();
cred_init();
- fork_init(totalram_pages);
+ fork_init();
proc_caches_init();
- buffer_init();
+ uts_ns_init();
+ time_ns_init();
key_init();
security_init();
dbg_late_init();
- vfs_caches_init(totalram_pages);
+ net_ns_init();
+ vfs_caches_init();
+ pagecache_init();
signals_init();
- /* rootfs populating might need page-writeback */
- page_writeback_init();
-#ifdef CONFIG_PROC_FS
+ seq_file_init();
proc_root_init();
-#endif
- cgroup_init();
+ nsfs_init();
+ pidfs_init();
cpuset_init();
+ mem_cgroup_init();
+ cgroup_init();
taskstats_init_early();
delayacct_init();
- check_bugs();
-
- acpi_early_init(); /* before LAPIC and SMP init */
- sfi_init_late();
-
- if (efi_enabled(EFI_RUNTIME_SERVICES)) {
- efi_late_init();
- efi_free_boot_services();
- }
-
- ftrace_init();
+ acpi_subsystem_init();
+ arch_post_acpi_subsys_init();
+ kcsan_init();
/* Do the rest non-__init'ed, we're now alive */
rest_init();
+
+ /*
+ * Avoid stack canaries in callers of boot_init_stack_canary for gcc-10
+ * and older.
+ */
+#if !__has_attribute(__no_stack_protector__)
+ prevent_tail_call_optimization();
+#endif
}
/* Call all constructor functions linked into the kernel. */
static void __init do_ctors(void)
{
-#ifdef CONFIG_CONSTRUCTORS
+/*
+ * For UML, the constructors have already been called by the
+ * normal setup code as it's just a normal ELF binary, so we
+ * cannot do it again - but we do need CONFIG_CONSTRUCTORS
+ * even on UML for modules.
+ */
+#if defined(CONFIG_CONSTRUCTORS) && !defined(CONFIG_UML)
ctor_fn_t *fn = (ctor_fn_t *) __ctors_start;
for (; fn < (ctor_fn_t *) __ctors_end; fn++)
@@ -654,66 +1231,171 @@ static void __init do_ctors(void)
#endif
}
-bool initcall_debug;
-core_param(initcall_debug, initcall_debug, bool, 0644);
+#ifdef CONFIG_KALLSYMS
+struct blacklist_entry {
+ struct list_head next;
+ char *buf;
+};
+
+static __initdata_or_module LIST_HEAD(blacklisted_initcalls);
-static int __init_or_module do_one_initcall_debug(initcall_t fn)
+static int __init initcall_blacklist(char *str)
{
- ktime_t calltime, delta, rettime;
- unsigned long long duration;
- int ret;
+ char *str_entry;
+ struct blacklist_entry *entry;
+
+ /* str argument is a comma-separated list of functions */
+ do {
+ str_entry = strsep(&str, ",");
+ if (str_entry) {
+ pr_debug("blacklisting initcall %s\n", str_entry);
+ entry = memblock_alloc_or_panic(sizeof(*entry),
+ SMP_CACHE_BYTES);
+ entry->buf = memblock_alloc_or_panic(strlen(str_entry) + 1,
+ SMP_CACHE_BYTES);
+ strcpy(entry->buf, str_entry);
+ list_add(&entry->next, &blacklisted_initcalls);
+ }
+ } while (str_entry);
+
+ return 1;
+}
+
+static bool __init_or_module initcall_blacklisted(initcall_t fn)
+{
+ struct blacklist_entry *entry;
+ char fn_name[KSYM_SYMBOL_LEN];
+ unsigned long addr;
+
+ if (list_empty(&blacklisted_initcalls))
+ return false;
+
+ addr = (unsigned long) dereference_function_descriptor(fn);
+ sprint_symbol_no_offset(fn_name, addr);
+
+ /*
+ * fn will be "function_name [module_name]" where [module_name] is not
+ * displayed for built-in init functions. Strip off the [module_name].
+ */
+ strreplace(fn_name, ' ', '\0');
+
+ list_for_each_entry(entry, &blacklisted_initcalls, next) {
+ if (!strcmp(fn_name, entry->buf)) {
+ pr_debug("initcall %s blacklisted\n", fn_name);
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static int __init initcall_blacklist(char *str)
+{
+ pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n");
+ return 0;
+}
+
+static bool __init_or_module initcall_blacklisted(initcall_t fn)
+{
+ return false;
+}
+#endif
+__setup("initcall_blacklist=", initcall_blacklist);
+
+static __init_or_module void
+trace_initcall_start_cb(void *data, initcall_t fn)
+{
+ ktime_t *calltime = data;
+
+ printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current));
+ *calltime = ktime_get();
+}
+
+static __init_or_module void
+trace_initcall_finish_cb(void *data, initcall_t fn, int ret)
+{
+ ktime_t rettime, *calltime = data;
- pr_debug("calling %pF @ %i\n", fn, task_pid_nr(current));
- calltime = ktime_get();
- ret = fn();
rettime = ktime_get();
- delta = ktime_sub(rettime, calltime);
- duration = (unsigned long long) ktime_to_ns(delta) >> 10;
- pr_debug("initcall %pF returned %d after %lld usecs\n",
- fn, ret, duration);
+ printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n",
+ fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime));
+}
- return ret;
+static __init_or_module void
+trace_initcall_level_cb(void *data, const char *level)
+{
+ printk(KERN_DEBUG "entering initcall level: %s\n", level);
+}
+
+static ktime_t initcall_calltime;
+
+#ifdef TRACEPOINTS_ENABLED
+static void __init initcall_debug_enable(void)
+{
+ int ret;
+
+ ret = register_trace_initcall_start(trace_initcall_start_cb,
+ &initcall_calltime);
+ ret |= register_trace_initcall_finish(trace_initcall_finish_cb,
+ &initcall_calltime);
+ ret |= register_trace_initcall_level(trace_initcall_level_cb, NULL);
+ WARN(ret, "Failed to register initcall tracepoints\n");
+}
+# define do_trace_initcall_start trace_initcall_start
+# define do_trace_initcall_finish trace_initcall_finish
+# define do_trace_initcall_level trace_initcall_level
+#else
+static inline void do_trace_initcall_start(initcall_t fn)
+{
+ if (!initcall_debug)
+ return;
+ trace_initcall_start_cb(&initcall_calltime, fn);
+}
+static inline void do_trace_initcall_finish(initcall_t fn, int ret)
+{
+ if (!initcall_debug)
+ return;
+ trace_initcall_finish_cb(&initcall_calltime, fn, ret);
+}
+static inline void do_trace_initcall_level(const char *level)
+{
+ if (!initcall_debug)
+ return;
+ trace_initcall_level_cb(NULL, level);
}
+#endif /* !TRACEPOINTS_ENABLED */
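For reference, this machinery is driven by the existing initcall_debug boot parameter; with

	initcall_debug

on the command line, each initcall is bracketed by messages of the form produced by the callbacks above (symbol name and timing purely illustrative):

	calling  foo_driver_init+0x0/0x40 @ 1
	initcall foo_driver_init+0x0/0x40 returned 0 after 12 usecs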
int __init_or_module do_one_initcall(initcall_t fn)
{
int count = preempt_count();
- int ret;
char msgbuf[64];
+ int ret;
- if (initcall_debug)
- ret = do_one_initcall_debug(fn);
- else
- ret = fn();
+ if (initcall_blacklisted(fn))
+ return -EPERM;
+
+ do_trace_initcall_start(fn);
+ ret = fn();
+ do_trace_initcall_finish(fn, ret);
msgbuf[0] = 0;
if (preempt_count() != count) {
sprintf(msgbuf, "preemption imbalance ");
- preempt_count() = count;
+ preempt_count_set(count);
}
if (irqs_disabled()) {
strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
local_irq_enable();
}
- WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf);
+ WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf);
+ add_latent_entropy();
return ret;
}
-extern initcall_t __initcall_start[];
-extern initcall_t __initcall0_start[];
-extern initcall_t __initcall1_start[];
-extern initcall_t __initcall2_start[];
-extern initcall_t __initcall3_start[];
-extern initcall_t __initcall4_start[];
-extern initcall_t __initcall5_start[];
-extern initcall_t __initcall6_start[];
-extern initcall_t __initcall7_start[];
-extern initcall_t __initcall_end[];
-
-static initcall_t *initcall_levels[] __initdata = {
+static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
@@ -726,8 +1408,8 @@ static initcall_t *initcall_levels[] __initdata = {
};
/* Keep these in sync with initcalls in include/linux/init.h */
-static char *initcall_level_names[] __initdata = {
- "early",
+static const char *initcall_level_names[] __initdata = {
+ "pure",
"core",
"postcore",
"arch",
@@ -737,28 +1419,44 @@ static char *initcall_level_names[] __initdata = {
"late",
};
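For context, a minimal sketch of how an initcall lands in one of these levels; the driver name is hypothetical, while device_initcall() is the standard macro from include/linux/init.h for the "device" level:

	#include <linux/init.h>
	#include <linux/printk.h>

	/* Hypothetical built-in driver: placed in the "device" initcall section */
	static int __init foo_driver_init(void)
	{
		pr_info("foo: initialized\n");
		return 0;
	}
	device_initcall(foo_driver_init);

do_initcall_level() then walks the matching __initcall section and hands each entry to do_one_initcall().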
-static void __init do_initcall_level(int level)
+static int __init ignore_unknown_bootoption(char *param, char *val,
+ const char *unused, void *arg)
+{
+ return 0;
+}
+
+static void __init do_initcall_level(int level, char *command_line)
{
- extern const struct kernel_param __start___param[], __stop___param[];
- initcall_t *fn;
+ initcall_entry_t *fn;
- strcpy(static_command_line, saved_command_line);
parse_args(initcall_level_names[level],
- static_command_line, __start___param,
+ command_line, __start___param,
__stop___param - __start___param,
level, level,
- &repair_env_string);
+ NULL, ignore_unknown_bootoption);
+ do_trace_initcall_level(initcall_level_names[level]);
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
- do_one_initcall(*fn);
+ do_one_initcall(initcall_from_entry(fn));
}
static void __init do_initcalls(void)
{
int level;
+ size_t len = saved_command_line_len + 1;
+ char *command_line;
+
+ command_line = kzalloc(len, GFP_KERNEL);
+ if (!command_line)
+ panic("%s: Failed to allocate %zu bytes\n", __func__, len);
- for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
- do_initcall_level(level);
+ for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) {
+ /* Parser modifies command_line, restore it each time */
+ strcpy(command_line, saved_command_line);
+ do_initcall_level(level, command_line);
+ }
+
+ kfree(command_line);
}
/*
@@ -771,60 +1469,143 @@ static void __init do_initcalls(void)
static void __init do_basic_setup(void)
{
cpuset_init_smp();
- usermodehelper_init();
- shmem_init();
driver_init();
init_irq_proc();
do_ctors();
- usermodehelper_enable();
do_initcalls();
}
static void __init do_pre_smp_initcalls(void)
{
- initcall_t *fn;
+ initcall_entry_t *fn;
+ do_trace_initcall_level("early");
for (fn = __initcall_start; fn < __initcall0_start; fn++)
- do_one_initcall(*fn);
+ do_one_initcall(initcall_from_entry(fn));
}
-/*
- * This function requests modules which should be loaded by default and is
- * called twice right after initrd is mounted and right before init is
- * exec'd. If such modules are on either initrd or rootfs, they will be
- * loaded before control is passed to userland.
- */
-void __init load_default_modules(void)
+static int run_init_process(const char *init_filename)
{
- load_default_elevator_module();
+ const char *const *p;
+
+ argv_init[0] = init_filename;
+ pr_info("Run %s as init process\n", init_filename);
+ pr_debug(" with arguments:\n");
+ for (p = argv_init; *p; p++)
+ pr_debug(" %s\n", *p);
+ pr_debug(" with environment:\n");
+ for (p = envp_init; *p; p++)
+ pr_debug(" %s\n", *p);
+ return kernel_execve(init_filename, argv_init, envp_init);
}
-static int run_init_process(const char *init_filename)
+static int try_to_run_init_process(const char *init_filename)
{
- argv_init[0] = init_filename;
- return do_execve(init_filename,
- (const char __user *const __user *)argv_init,
- (const char __user *const __user *)envp_init);
+ int ret;
+
+ ret = run_init_process(init_filename);
+
+ if (ret && ret != -ENOENT) {
+ pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
+ init_filename, ret);
+ }
+
+ return ret;
}
static noinline void __init kernel_init_freeable(void);
+#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
+bool rodata_enabled __ro_after_init = true;
+
+#ifndef arch_parse_debug_rodata
+static inline bool arch_parse_debug_rodata(char *str) { return false; }
+#endif
+
+static int __init set_debug_rodata(char *str)
+{
+ if (arch_parse_debug_rodata(str))
+ return 0;
+
+ if (str && !strcmp(str, "on"))
+ rodata_enabled = true;
+ else if (str && !strcmp(str, "off"))
+ rodata_enabled = false;
+ else
+ pr_warn("Invalid option string for rodata: '%s'\n", str);
+ return 0;
+}
+early_param("rodata", set_debug_rodata);
+#endif
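The handler above accepts "on" or "off"; for example, a debug boot with

	rodata=off

leaves rodata_enabled clear, so mark_readonly() below logs "Kernel memory protection disabled." instead of calling mark_rodata_ro().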
+
+static void mark_readonly(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && rodata_enabled) {
+ /*
+ * load_module() results in W+X mappings, which are cleaned
+ * up with init_free_wq. Let's make sure that queued work is
+ * flushed so that we don't hit false positives looking for
+ * insecure pages which are W+X.
+ */
+ flush_module_init_free_work();
+ jump_label_init_ro();
+ mark_rodata_ro();
+ debug_checkwx();
+ rodata_test();
+ } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
+ pr_info("Kernel memory protection disabled.\n");
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_STRICT_KERNEL_RWX)) {
+ pr_warn("Kernel memory protection not selected by kernel config.\n");
+ } else {
+ pr_warn("This architecture does not have kernel memory protection.\n");
+ }
+}
+
+void __weak free_initmem(void)
+{
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
static int __ref kernel_init(void *unused)
{
+ int ret;
+
+ /*
+ * Wait until kthreadd is all set-up.
+ */
+ wait_for_completion(&kthreadd_done);
+
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
+
+ system_state = SYSTEM_FREEING_INITMEM;
+ kprobe_free_init_mem();
+ ftrace_free_init_mem();
+ kgdb_free_init_mem();
+ exit_boot_config();
free_initmem();
- mark_rodata_ro();
+ mark_readonly();
+
+ /*
+ * Kernel mappings are now finalized - update the userspace page-table
+ * to finalize PTI.
+ */
+ pti_finalize();
+
system_state = SYSTEM_RUNNING;
numa_default_policy();
- flush_delayed_fput();
+ rcu_end_inkernel_boot();
+
+ do_sysctl_args();
if (ramdisk_execute_command) {
- if (!run_init_process(ramdisk_execute_command))
+ ret = run_init_process(ramdisk_execute_command);
+ if (!ret)
return 0;
- pr_err("Failed to execute %s\n", ramdisk_execute_command);
+ pr_err("Failed to execute %s (error %d)\n",
+ ramdisk_execute_command, ret);
}
/*
@@ -834,28 +1615,49 @@ static int __ref kernel_init(void *unused)
* trying to recover a really broken machine.
*/
if (execute_command) {
- if (!run_init_process(execute_command))
+ ret = run_init_process(execute_command);
+ if (!ret)
return 0;
- pr_err("Failed to execute %s. Attempting defaults...\n",
- execute_command);
+ panic("Requested init %s failed (error %d).",
+ execute_command, ret);
}
- if (!run_init_process("/sbin/init") ||
- !run_init_process("/etc/init") ||
- !run_init_process("/bin/init") ||
- !run_init_process("/bin/sh"))
+
+ if (CONFIG_DEFAULT_INIT[0] != '\0') {
+ ret = run_init_process(CONFIG_DEFAULT_INIT);
+ if (ret)
+ pr_err("Default init %s failed (error %d)\n",
+ CONFIG_DEFAULT_INIT, ret);
+ else
+ return 0;
+ }
+
+ if (!try_to_run_init_process("/sbin/init") ||
+ !try_to_run_init_process("/etc/init") ||
+ !try_to_run_init_process("/bin/init") ||
+ !try_to_run_init_process("/bin/sh"))
return 0;
- panic("No init found. Try passing init= option to kernel. "
- "See Linux Documentation/init.txt for guidance.");
+ panic("No working init found. Try passing init= option to kernel. "
+ "See Linux Documentation/admin-guide/init.rst for guidance.");
}
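Tying the fallback order above together, a command line such as (paths illustrative)

	root=/dev/sda2 ro init=/usr/lib/systemd/systemd

makes kernel_init() run that binary and panic if it fails; rdinit= instead selects the initramfs init handled earlier via ramdisk_execute_command, and the built-in /sbin/init, /etc/init, /bin/init, /bin/sh sequence is only tried when neither is given and CONFIG_DEFAULT_INIT is empty.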
-static noinline void __init kernel_init_freeable(void)
+/* Open /dev/console, for stdin/stdout/stderr, this should never fail */
+void __init console_on_rootfs(void)
{
- /*
- * Wait until kthreadd is all set-up.
- */
- wait_for_completion(&kthreadd_done);
+ struct file *file = filp_open("/dev/console", O_RDWR, 0);
+
+ if (IS_ERR(file)) {
+ pr_err("Warning: unable to open an initial console.\n");
+ return;
+ }
+ init_dup(file);
+ init_dup(file);
+ init_dup(file);
+ fput(file);
+}
+static noinline void __init kernel_init_freeable(void)
+{
/* Now the scheduler is fully set up and can do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
@@ -863,38 +1665,42 @@ static noinline void __init kernel_init_freeable(void)
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
- cad_pid = task_pid(current);
+ cad_pid = get_pid(task_pid(current));
smp_prepare_cpus(setup_max_cpus);
+ workqueue_init();
+
+ init_mm_internals();
+
do_pre_smp_initcalls();
lockup_detector_init();
smp_init();
sched_init_smp();
+ workqueue_init_topology();
+ async_init();
+ padata_init();
+ page_alloc_init_late();
+
do_basic_setup();
- /* Open the /dev/console on the rootfs, this should never fail */
- if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
- pr_err("Warning: unable to open an initial console.\n");
+ kunit_run_all_tests();
+
+ wait_for_initramfs();
+ console_on_rootfs();
- (void) sys_dup(0);
- (void) sys_dup(0);
/*
* check if there is an early userspace init. If yes, let it do all
* the work
*/
-
- if (!ramdisk_execute_command)
- ramdisk_execute_command = "/init";
-
- if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
+ int ramdisk_command_access;
+ ramdisk_command_access = init_eaccess(ramdisk_execute_command);
+ if (ramdisk_command_access != 0) {
+ pr_warn("check access for rdinit=%s failed: %i, ignoring\n",
+ ramdisk_execute_command, ramdisk_command_access);
ramdisk_execute_command = NULL;
prepare_namespace();
}
@@ -903,8 +1709,10 @@ static noinline void __init kernel_init_freeable(void)
* Ok, we have completed the initial bootup, and
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
+ *
+ * rootfs is available now, try loading the public keys
+ * and default modules
*/
- /* rootfs is available now, try loading default modules */
- load_default_modules();
+ integrity_load_keys();
}
diff --git a/init/noinitramfs.c b/init/noinitramfs.c
index 267739d85179..d1d26b93d25c 100644
--- a/init/noinitramfs.c
+++ b/init/noinitramfs.c
@@ -1,26 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* init/noinitramfs.c
*
* Copyright (C) 2006, NXP Semiconductors, All Rights Reserved
* Author: Jean-Paul Saman <jean-paul.saman@nxp.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
#include <linux/stat.h>
#include <linux/kdev_t.h>
#include <linux/syscalls.h>
+#include <linux/init_syscalls.h>
+#include <linux/umh.h>
/*
* Create a simple rootfs that is similar to the default initramfs
@@ -29,17 +19,17 @@ static int __init default_rootfs(void)
{
int err;
- err = sys_mkdir((const char __user __force *) "/dev", 0755);
+ usermodehelper_enable();
+ err = init_mkdir("/dev", 0755);
if (err < 0)
goto out;
- err = sys_mknod((const char __user __force *) "/dev/console",
- S_IFCHR | S_IRUSR | S_IWUSR,
+ err = init_mknod("/dev/console", S_IFCHR | S_IRUSR | S_IWUSR,
new_encode_dev(MKDEV(5, 1)));
if (err < 0)
goto out;
- err = sys_mkdir((const char __user __force *) "/root", 0700);
+ err = init_mkdir("/root", 0700);
if (err < 0)
goto out;
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
new file mode 100644
index 000000000000..375726e05f69
--- /dev/null
+++ b/init/version-timestamp.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <generated/compile.h>
+#include <generated/utsrelease.h>
+#include <linux/proc_ns.h>
+#include <linux/refcount.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+
+struct uts_namespace init_uts_ns = {
+ .ns = NS_COMMON_INIT(init_uts_ns),
+ .name = {
+ .sysname = UTS_SYSNAME,
+ .nodename = UTS_NODENAME,
+ .release = UTS_RELEASE,
+ .version = UTS_VERSION,
+ .machine = UTS_MACHINE,
+ .domainname = UTS_DOMAINNAME,
+ },
+ .user_ns = &init_user_ns,
+};
+
+/* FIXED STRINGS! Don't touch! */
+const char linux_banner[] =
+ "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n";
diff --git a/init/version.c b/init/version.c
index 1a4718e500fe..94c96f6fbfe6 100644
--- a/init/version.c
+++ b/init/version.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/init/version.c
*
@@ -7,44 +8,47 @@
*/
#include <generated/compile.h>
-#include <linux/module.h>
+#include <linux/build-salt.h>
+#include <linux/elfnote-lto.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/printk.h>
#include <linux/uts.h>
#include <linux/utsname.h>
-#include <generated/utsrelease.h>
-#include <linux/version.h>
#include <linux/proc_ns.h>
-#ifndef CONFIG_KALLSYMS
-#define version(a) Version_ ## a
-#define version_string(a) version(a)
-
-extern int version_string(LINUX_VERSION_CODE);
-int version_string(LINUX_VERSION_CODE);
-#endif
-
-struct uts_namespace init_uts_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
- .name = {
- .sysname = UTS_SYSNAME,
- .nodename = UTS_NODENAME,
- .release = UTS_RELEASE,
- .version = UTS_VERSION,
- .machine = UTS_MACHINE,
- .domainname = UTS_DOMAINNAME,
- },
- .user_ns = &init_user_ns,
- .proc_inum = PROC_UTS_INIT_INO,
-};
-EXPORT_SYMBOL_GPL(init_uts_ns);
+static int __init early_hostname(char *arg)
+{
+ size_t bufsize = sizeof(init_uts_ns.name.nodename);
+ size_t maxlen = bufsize - 1;
+ ssize_t arglen;
-/* FIXED STRINGS! Don't touch! */
-const char linux_banner[] =
- "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
- LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n";
+ arglen = strscpy(init_uts_ns.name.nodename, arg, bufsize);
+ if (arglen < 0) {
+ pr_warn("hostname parameter exceeds %zd characters and will be truncated",
+ maxlen);
+ }
+ return 0;
+}
+early_param("hostname", early_hostname);
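For example, booting with

	hostname=buildhost-01

seeds init_uts_ns.name.nodename before any userspace runs; arguments longer than the nodename buffer are truncated and trigger the warning above. (The host name is illustrative.)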
const char linux_proc_banner[] =
"%s version %s"
" (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")"
" (" LINUX_COMPILER ") %s\n";
+
+BUILD_SALT;
+BUILD_LTO_INFO;
+
+/*
+ * init_uts_ns and linux_banner contain the build version and timestamp,
+ * which are really fixed at the very last step of the build process.
+ * They are compiled with __weak first, and without __weak later.
+ */
+
+struct uts_namespace init_uts_ns __weak;
+const char linux_banner[] __weak;
+
+#include "version-timestamp.c"
+
+EXPORT_SYMBOL_GPL(init_uts_ns);